From 539da7877275edb21a76aa02fb2c147eff02c559 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Nov 2015 22:57:00 +0000 Subject: x86/apic: Add a single-target IPI function to the apic We still fall back on the "send mask" versions if an apic definition doesn't have the single-target version, but at least this allows the (trivial) case for the common clustered x2apic case. Signed-off-by: Linus Torvalds Reviewed-by: Ingo Molnar Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.737120838@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/smp.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a30316b..7f62ad4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -303,6 +303,7 @@ struct apic { unsigned int *apicid); /* ipi */ + void (*send_IPI)(int cpu, int vector); void (*send_IPI_mask)(const struct cpumask *mask, int vector); void (*send_IPI_mask_allbutself)(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 12c8286..1dbf590 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,6 +115,18 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; /* + * Helper wrapper: not all apic definitions support sending to + * a single CPU, so we fall back to sending to a mask. + */ +static void send_IPI_cpu(int cpu, int vector) +{ + if (apic->send_IPI) + apic->send_IPI(cpu, vector); + else + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + +/* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... 
@@ -125,12 +137,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); + send_IPI_cpu(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_cpu(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) -- cgit v1.1 From 7b6ce46cb3d096831dea3accacee4717c66abac8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Nov 2015 22:57:00 +0000 Subject: x86/apic: Implement single target IPI function for x2apic_cluster [ tglx: Split it out from the patch which provides the new callback ] Signed-off-by: Linus Torvalds Reviewed-by: Ingo Molnar Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.817975597@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/x2apic_cluster.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cc8311c..aca8b75 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -23,6 +23,14 @@ static inline u32 x2apic_cluster(int cpu) return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -266,6 +274,7 @@ static struct apic apic_x2apic_cluster = { .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, -- cgit v1.1 From 53be0fac8bdaeec87e0df7d0334345421d2be187 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:01 +0000 Subject: x86/apic: Implement default single target IPI function apic_physflat and bigsmp_apic can share that implementation. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.898543767@linutronix.de --- arch/x86/include/asm/ipi.h | 1 + arch/x86/kernel/apic/ipi.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 615fa90..22998a8 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,7 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6207156..4fcffbf 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,16 @@ #include #include +void default_send_IPI_single_phys(int cpu, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) { unsigned long query_cpu; -- cgit v1.1 From 449112f4f35074f1dc70d4f0e769cb14150c159c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:02 +0000 Subject: x86/apic: Remove pointless indirections from apic_physflat No value in having 32 byte extra text. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220848.975653382@linutronix.de --- arch/x86/kernel/apic/apic_flat_64.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f92ab36..6d3e1a6 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -230,17 +230,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - default_send_IPI_mask_sequence_phys(cpumask, vector); -} - -static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, - int vector) -{ - default_send_IPI_mask_allbutself_phys(cpumask, vector); -} - static void physflat_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -248,7 +237,7 @@ static void physflat_send_IPI_allbutself(int vector) static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int physflat_probe(void) @@ -292,8 +281,8 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = physflat_send_IPI_mask, - .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, -- cgit v1.1 From 68cd88ff8df97846eb07080f17264a4de50cb012 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:02 
+0000 Subject: x86/apic: Wire up single IPI for apic_physflat Use the default implementation. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.055046864@linutronix.de --- arch/x86/kernel/apic/apic_flat_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 6d3e1a6..9de25d4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -281,6 +281,7 @@ static struct apic apic_physflat = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, .send_IPI_allbutself = physflat_send_IPI_allbutself, -- cgit v1.1 From 500bd02fb17e5d9296c77ccc07db61fd5d4922a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:03 +0000 Subject: x86/apic: Remove pointless indirections from bigsmp_apic Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.133086575@linutronix.de --- arch/x86/kernel/apic/bigsmp_32.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 971cf88..d4d103b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -96,11 +96,6 @@ static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) -{ - default_send_IPI_mask_sequence_phys(mask, vector); -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -108,7 +103,7 @@ static void bigsmp_send_IPI_allbutself(int vector) static void bigsmp_send_IPI_all(int vector) { - bigsmp_send_IPI_mask(cpu_online_mask, vector); + default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); } static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -180,7 +175,7 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, - .send_IPI_mask = bigsmp_send_IPI_mask, + .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, .send_IPI_all = bigsmp_send_IPI_all, -- cgit v1.1 From 5789a12e28f7bf6a37564a5fc9ebc60dc86659b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:04 +0000 Subject: x86/apic: Wire up single IPI for bigsmp_apic Use the default implementation. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.213292642@linutronix.de --- arch/x86/kernel/apic/bigsmp_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d4d103b..cf9bd896 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -175,6 +175,7 @@ static struct apic apic_bigsmp = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = NULL, .send_IPI_allbutself = bigsmp_send_IPI_allbutself, -- cgit v1.1 From f2bffe8a3eef42a1cd3393d56acd9fe598d2119c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:04 +0000 Subject: x86/apic: Implement single IPI for x2apic_phys Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.296438009@linutronix.de --- arch/x86/kernel/apic/x2apic_phys.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 662e915..a1242e2 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -36,6 +36,14 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } +static void x2apic_send_IPI(int cpu, int vector) +{ + u32 dest = per_cpu(x86_cpu_to_apicid, cpu); + + x2apic_wrmsr_fence(); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); +} + static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { @@ -122,6 +130,7 @@ static struct apic apic_x2apic_phys = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_allbutself = x2apic_send_IPI_allbutself, -- cgit v1.1 From 8642ea953d99fc037c1076e9a8b3a822025fb251 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:05 +0000 Subject: x86/apic: Wire up single IPI for x2apic_uv The function already exists. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.376775625@linutronix.de --- arch/x86/kernel/apic/x2apic_uv_x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 4a13946..d760c6b 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -406,6 +406,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_allbutself = uv_send_IPI_allbutself, -- cgit v1.1 From c61a0d31ba0ce75cb1b88bb4eb2f41a1b80bc90f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:06 +0000 Subject: x86/apic: Wire up single IPI for apic_numachip The function already exists. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.551445489@linutronix.de --- arch/x86/kernel/apic/apic_numachip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 38dd5ef..69329a6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -276,6 +276,7 @@ static const struct apic apic_numachip1 __refconst = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, @@ -327,6 +328,7 @@ static const struct apic apic_numachip2 __refconst = { .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, .send_IPI_allbutself = numachip_send_IPI_allbutself, -- cgit v1.1 From 4727da2eb1ec79fdc2acdd2f764b5b2aacab998c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:06 +0000 Subject: x86/apic: Implement single IPI for apic_noop Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.455429817@linutronix.de --- arch/x86/kernel/apic/apic_noop.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0d96749..331a7a0 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -30,6 +30,7 @@ #include static void noop_init_apic_ldr(void) { } +static void noop_send_IPI(int cpu, int vector) { } static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } static void noop_send_IPI_allbutself(int vector) { } @@ -144,6 +145,7 @@ struct apic apic_noop = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, .send_IPI_allbutself = noop_send_IPI_allbutself, -- cgit v1.1 From 7e29393b20a1a863a5f9bf48dc71e5cff4035ff5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:07 +0000 Subject: x86/apic: Provide default send single IPI wrapper Instead of doing the wrapping in the smp code we can provide a default wrapper for those APICs which insist on cpumasks. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.631111846@linutronix.de --- arch/x86/include/asm/ipi.h | 1 + arch/x86/kernel/apic/ipi.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index 22998a8..cfc9a0d 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -119,6 +119,7 @@ static inline void native_apic_mem_write(APIC_ICR, cfg); } +extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 4fcffbf..eb45fc9 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -65,6 +65,14 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +/* + * Helper function for APICs which insist on cpumasks + */ +void default_send_IPI_single(int cpu, int vector) +{ + apic->send_IPI_mask(cpumask_of(cpu), vector); +} + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, -- cgit v1.1 From 6153058a03f4cc5200b0b29e201caa11779ebca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:08 +0000 Subject: x86/apic: Use default send single IPI wrapper Wire up the default_send_IPI_single() wrapper to the last holdouts. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.711224890@linutronix.de --- arch/x86/kernel/apic/apic_flat_64.c | 1 + arch/x86/kernel/apic/probe_32.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9de25d4..9968f30 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,6 +185,7 @@ static struct apic apic_flat = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_allbutself = flat_send_IPI_allbutself, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7694ae6..f316e34 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -105,6 +105,7 @@ static struct apic apic_default = { .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, .send_IPI_allbutself = default_send_IPI_allbutself, -- cgit v1.1 From 72613184a1f076659e8a902d64351f50d3f9c990 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2015 22:57:09 +0000 Subject: x86/smp: Remove single IPI wrapper All APIC implementations have send_IPI now. Remove the conditional in the calling code.
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Travis Cc: Daniel J Blueman Link: http://lkml.kernel.org/r/20151104220849.807817097@linutronix.de --- arch/x86/kernel/smp.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 1dbf590..658777c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,18 +115,6 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; /* - * Helper wrapper: not all apic definitions support sending to - * a single CPU, so we fall back to sending to a mask. - */ -static void send_IPI_cpu(int cpu, int vector) -{ - if (apic->send_IPI) - apic->send_IPI(cpu, vector); - else - apic->send_IPI_mask(cpumask_of(cpu), vector); -} - -/* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... @@ -137,12 +125,12 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_cpu(cpu, RESCHEDULE_VECTOR); + apic->send_IPI(cpu, RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_cpu(cpu, CALL_FUNCTION_SINGLE_VECTOR); + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); } void native_send_call_func_ipi(const struct cpumask *mask) -- cgit v1.1 From 79f1d836925c545b4612f7ed19423f0950978b5e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 3 Nov 2015 10:18:49 +0100 Subject: x86/paravirt: Kill some unused patching functions paravirt_patch_ignore() is completely unused and paravirt_patch_nop() doesn't do a whole lot. Remove them both. Signed-off-by: Borislav Petkov Reviewed-by: Juergen Gross Cc: Andrew Morton Cc: Andy Lutomirski Cc: Chris Wright Cc: Jeremy Fitzhardinge Cc: "Peter Zijlstra (Intel)" Cc: Rusty Russell Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1446542329-32037-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/kernel/paravirt.c | 13 +------------ 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5..e1f31df 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -402,10 +402,8 @@ extern struct pv_lock_ops pv_lock_ops; __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) -unsigned paravirt_patch_nop(void); unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); unsigned paravirt_patch_call(void *insnbuf, const void *target, u16 tgt_clobbers, unsigned long addr, u16 site_clobbers, diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c2130ae..4f32a10 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -74,16 +74,6 @@ void __init default_banner(void) /* Undefined instruction for dealing with missing ops pointers. 
*/ static const unsigned char ud2a[] = { 0x0f, 0x0b }; -unsigned paravirt_patch_nop(void) -{ - return 0; -} - -unsigned paravirt_patch_ignore(unsigned len) -{ - return len; -} - struct branch { unsigned char opcode; u32 delta; @@ -152,8 +142,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If there's no function, patch it with a ud2a (BUG) */ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == _paravirt_nop) - /* If the operation is a nop, then nop the callsite */ - ret = paravirt_patch_nop(); + ret = 0; /* identity functions just return their single argument */ else if (opfunc == _paravirt_ident_32) -- cgit v1.1 From ed29210cd6a67425026e78aa298fa434e11a74e3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 17 Nov 2015 13:05:43 +0100 Subject: x86: Remove unused function cpu_has_ht_siblings() It is used nowhere. Signed-off-by: Juergen Gross Link: http://lkml.kernel.org/r/1447761943-770-1-git-send-email-jgross@suse.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/smp.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 222a6a3..a438c55 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -21,15 +21,6 @@ extern int smp_num_siblings; extern unsigned int num_processors; -static inline bool cpu_has_ht_siblings(void) -{ - bool has_siblings = false; -#ifdef CONFIG_SMP - has_siblings = cpu_has_ht && smp_num_siblings > 1; -#endif - return has_siblings; -} - DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); /* cpus sharing the last level cache: */ -- cgit v1.1 From 916d4092a1d2d7bb50630497be71ee4c4c2807fa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 18 Nov 2015 17:38:49 -0300 Subject: perf test: Fix build of BPF and LLVM on older glibc libraries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit $ rpm -q glibc glibc-2.12-1.166.el6_7.1.x86_64 CC /tmp/build/perf/tests/llvm.o cc1: warnings being treated as errors tests/llvm.c: In function ‘test_llvm__fetch_bpf_obj’: tests/llvm.c:53: error: declaration of ‘index’ shadows a global declaration /usr/include/string.h:489: error: shadowed declaration is here CC /tmp/build/perf/tests/bpf.o cc1: warnings being treated as errors tests/bpf.c: In function ‘__test__bpf’: tests/bpf.c:149: error: declaration of ‘index’ shadows a global declaration /usr/include/string.h:489: error: shadowed declaration is here Cc: He Kuang Cc: Jiri Olsa Cc: Namhyung Kim Cc: pi3orama@163.com Cc: Wang Nan Cc: Zefan Li Fixes: b31de018a628 ("perf test: Enhance the LLVM test: update basic BPF test program") Fixes: ba1fae431e74 ("perf test: Add 'perf test BPF'") Link: http://lkml.kernel.org/n/tip-akpo4r750oya2phxoh9e3447@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/bpf.c | 14 +++++++------- tools/perf/tests/llvm.c | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index ec16f78..6ebfdee 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -146,7 +146,7 @@ prepare_bpf(void *obj_buf, size_t obj_buf_sz, const char *name) return obj; } -static int __test__bpf(int index) +static int __test__bpf(int idx) { int ret; void *obj_buf; @@ -154,27 +154,27 @@ static int __test__bpf(int index) struct bpf_object *obj; ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz, - 
bpf_testcase_table[index].prog_id, + bpf_testcase_table[idx].prog_id, true); if (ret != TEST_OK || !obj_buf || !obj_buf_sz) { pr_debug("Unable to get BPF object, %s\n", - bpf_testcase_table[index].msg_compile_fail); - if (index == 0) + bpf_testcase_table[idx].msg_compile_fail); + if (idx == 0) return TEST_SKIP; else return TEST_FAIL; } obj = prepare_bpf(obj_buf, obj_buf_sz, - bpf_testcase_table[index].name); + bpf_testcase_table[idx].name); if (!obj) { ret = TEST_FAIL; goto out; } ret = do_test(obj, - bpf_testcase_table[index].target_func, - bpf_testcase_table[index].expect_result); + bpf_testcase_table[idx].target_func, + bpf_testcase_table[idx].expect_result); out: bpf__clear(); return ret; diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index bc4cf50..366e38b 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -50,7 +50,7 @@ static struct { int test_llvm__fetch_bpf_obj(void **p_obj_buf, size_t *p_obj_buf_sz, - enum test_llvm__testcase index, + enum test_llvm__testcase idx, bool force) { const char *source; @@ -59,11 +59,11 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf, char *tmpl_new = NULL, *clang_opt_new = NULL; int err, old_verbose, ret = TEST_FAIL; - if (index >= __LLVM_TESTCASE_MAX) + if (idx >= __LLVM_TESTCASE_MAX) return TEST_FAIL; - source = bpf_source_table[index].source; - desc = bpf_source_table[index].desc; + source = bpf_source_table[idx].source; + desc = bpf_source_table[idx].desc; perf_config(perf_config_cb, NULL); -- cgit v1.1 From 9a13c6587e2f0d5e80ce02f5f9ef62788b48d163 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Tue, 17 Nov 2015 13:54:19 -0800 Subject: tools: Fix selftests_install Makefile rule Fix copy/paste error in selftests_install rule which was copy-pasted from the clean rule but not properly changed. Signed-off-by: Kevin Hilman Cc: Bamvor Jian Zhang Cc: Jiri Olsa Cc: Jonathan Cameron Cc: Michael Ellerman Cc: Pali Rohar Cc: Pavel Machek Cc: Roberta Dobrescu Cc: Shuah Khan Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/1447797261-1775-1-git-send-email-khilman@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 7dc820a..0ba0df3 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -96,7 +96,7 @@ cgroup_install firewire_install hv_install lguest_install perf_install usb_insta $(call descend,$(@:_install=),install) selftests_install: - $(call descend,testing/$(@:_clean=),install) + $(call descend,testing/$(@:_install=),install) turbostat_install x86_energy_perf_policy_install: $(call descend,power/x86/$(@:_install=),install) -- cgit v1.1 From 4ddd32741da87657113d964588ce13ee64b34820 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 16 Nov 2015 11:36:29 -0300 Subject: tools: Adopt memdup() from tools/perf, moving it to tools/lib/string.c That will contain more string functions with counterparts, sometimes verbatim copies, in the kernel. 
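A minimal usage sketch of the adopted helper (the memdup() body below matches the patch; the main() driver and sample string are invented for illustration only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Same body as in tools/lib/string.c above: malloc() plus memcpy(). */
static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}

int main(void)
{
	const char src[] = "perf";
	char *copy = memdup(src, sizeof(src));	/* sizeof() includes the NUL */

	if (!copy)
		return 1;
	printf("%s\n", copy);
	free(copy);
	return 0;
}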
Acked-by: Wang Nan Cc: Adrian Hunter Cc: Alexey Dobriyan Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/n/tip-rah6g97kn21vfgmlramorz6o@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/string.h | 9 +++++++++ tools/lib/string.c | 19 +++++++++++++++++++ tools/perf/MANIFEST | 2 ++ tools/perf/util/Build | 6 ++++++ tools/perf/util/include/linux/string.h | 3 --- tools/perf/util/string.c | 16 ---------------- 6 files changed, 36 insertions(+), 19 deletions(-) create mode 100644 tools/include/linux/string.h create mode 100644 tools/lib/string.c delete mode 100644 tools/perf/util/include/linux/string.h diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h new file mode 100644 index 0000000..f3a6db6 --- /dev/null +++ b/tools/include/linux/string.h @@ -0,0 +1,9 @@ +#ifndef _TOOLS_LINUX_STRING_H_ +#define _TOOLS_LINUX_STRING_H_ + + +#include /* for size_t */ + +void *memdup(const void *src, size_t len); + +#endif /* _LINUX_STRING_H_ */ diff --git a/tools/lib/string.c b/tools/lib/string.c new file mode 100644 index 0000000..ecfd43a --- /dev/null +++ b/tools/lib/string.c @@ -0,0 +1,19 @@ +#include +#include +#include + +/** + * memdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + */ +void *memdup(const void *src, size_t len) +{ + void *p = malloc(len); + + if (p) + memcpy(p, src, len); + + return p; +} diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 39c38cb..2562eac 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -22,6 +22,7 @@ tools/lib/api tools/lib/bpf tools/lib/hweight.c tools/lib/rbtree.c +tools/lib/string.c tools/lib/symbol/kallsyms.c tools/lib/symbol/kallsyms.h tools/lib/util/find_next_bit.c @@ -50,6 +51,7 @@ tools/include/linux/log2.h tools/include/linux/poison.h tools/include/linux/rbtree.h tools/include/linux/rbtree_augmented.h +tools/include/linux/string.h tools/include/linux/types.h tools/include/linux/err.h include/asm-generic/bitops/arch_hweight.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 591b3fe..e231690 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -21,6 +21,7 @@ libperf-y += parse-events.o libperf-y += perf_regs.o libperf-y += path.o libperf-y += rbtree.o +libperf-y += libstring.o libperf-y += bitmap.o libperf-y += hweight.o libperf-y += run-command.o @@ -138,6 +139,7 @@ $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c CFLAGS_find_next_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_rbtree.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" +CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_parse-events.o += -Wno-redundant-decls @@ -153,6 +155,10 @@ $(OUTPUT)util/rbtree.o: ../lib/rbtree.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) +$(OUTPUT)util/libstring.o: ../lib/string.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + $(OUTPUT)util/hweight.o: ../lib/hweight.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) diff --git a/tools/perf/util/include/linux/string.h b/tools/perf/util/include/linux/string.h deleted file mode 100644 index 6f19c54..0000000 --- a/tools/perf/util/include/linux/string.h +++ /dev/null @@ -1,3 +0,0 @@ -#include - -void *memdup(const void *src, size_t len); diff --git a/tools/perf/util/string.c 
b/tools/perf/util/string.c index fc8781d..7f7e072 100644 --- a/tools/perf/util/string.c +++ b/tools/perf/util/string.c @@ -342,22 +342,6 @@ char *rtrim(char *s) return s; } -/** - * memdup - duplicate region of memory - * @src: memory region to duplicate - * @len: memory region length - */ -void *memdup(const void *src, size_t len) -{ - void *p; - - p = malloc(len); - if (p) - memcpy(p, src, len); - - return p; -} - char *asprintf_expr_inout_ints(const char *var, bool in, size_t nints, int *ints) { /* -- cgit v1.1 From 7d85c434214ea0b3416f7a62f76a0785b00d8797 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 11:42:05 -0300 Subject: tools: Clone the kernel's strtobool function Copying it to tools/lib/string.c, the counterpart to the kernel's lib/string.c. This is preparation for enhancing BPF program configuration, which will allow config string like 'inlines=yes'. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Jonathan Cameron Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-6-git-send-email-wangnan0@huawei.com [ Copied it to tools/lib/string.c instead, to make it usable by other tools/ ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/string.h | 2 ++ tools/lib/string.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index f3a6db6..2e2f736 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -6,4 +6,6 @@ void *memdup(const void *src, size_t len); +int strtobool(const char *s, bool *res); + #endif /* _LINUX_STRING_H_ */ diff --git a/tools/lib/string.c b/tools/lib/string.c index ecfd43a..065e54f 100644 --- a/tools/lib/string.c +++ b/tools/lib/string.c @@ -1,5 +1,20 @@ +/* + * linux/tools/lib/string.c + * + * Copied from linux/lib/string.c, where it is: + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * More specifically, the first copied function was strtobool, which + * was introduced by: + * + * d0f1fed29e6e ("Add a strtobool function matching semantics of existing in kernel equivalents") + * Author: Jonathan Cameron + */ + #include #include +#include #include /** @@ -17,3 +32,31 @@ void *memdup(const void *src, size_t len) return p; } + +/** + * strtobool - convert common user inputs into boolean values + * @s: input string + * @res: result + * + * This routine returns 0 iff the first character is one of 'Yy1Nn0'. + * Otherwise it will return -EINVAL. Value pointed to by res is + * updated upon finding a match. + */ +int strtobool(const char *s, bool *res) +{ + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + break; + case 'n': + case 'N': + case '0': + *res = false; + break; + default: + return -EINVAL; + } + return 0; +} -- cgit v1.1 From b580563e38487d9db8e94080149644da71c533c1 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:09 +0000 Subject: bpf tools: Load a program with different instances using preprocessor This patch is a preparation for BPF prologue support which allows generating a series of BPF bytecode for fetching kernel data before calling program code. With the newly introduced multiple instances support, perf is able to create different prologues for different kprobe points. Before this patch, a bpf_program can be loaded into kernel only once, and get the only resulting fd. What this patch does is to allow creating and loading different variants of one bpf_program, then fetching their fds. 
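A hedged caller-side sketch of the new APIs described below (sample_prep() and setup_and_fetch() are invented names, not code from this patch; the real entry points are bpf_program__set_prep() and bpf_program__nth_fd() as declared in libbpf.h):

/* Hypothetical preprocessor: instance 0 keeps the original
 * instructions unchanged, instance 1 is skipped entirely.
 * (prog is unused in this trivial example.) */
static int sample_prep(struct bpf_program *prog, int n,
		       struct bpf_insn *insns, int insns_cnt,
		       struct bpf_prog_prep_result *res)
{
	if (n == 0) {
		res->new_insn_ptr = insns;
		res->new_insn_cnt = insns_cnt;
	} else {
		res->new_insn_ptr = NULL;	/* NULL: don't load this one */
		res->new_insn_cnt = 0;
	}
	res->pfd = NULL;
	return 0;				/* zero: pre-processing success */
}

/* Hypothetical call site: set the preprocessor before loading,
 * then fetch the fd of instance 0 after bpf_object__load(). */
static int setup_and_fetch(struct bpf_object *obj, struct bpf_program *prog)
{
	bpf_program__set_prep(prog, 2, sample_prep);
	if (bpf_object__load(obj))
		return -1;
	return bpf_program__nth_fd(prog, 0);
}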
Here we describe the basic idea in this patch. The detailed description of the newly introduced APIs can be found in comments in the patch body. The key of this patch is the new mechanism in bpf_program__load(). Instead of loading the BPF program into the kernel directly, it calls a 'pre-processor' to generate program instances, based on the original code, which are then loaded into the kernel. To enable the generation of multiple instances, libbpf passes an index to the pre-processor so it knows which instance is being loaded. The pre-processor is registered by libbpf's user (perf) using bpf_program__set_prep(). The number of instances and the relationship between indices and the target instance should be clear when calling bpf_program__set_prep(). To retrieve an fd for a specific instance of a program, bpf_program__nth_fd() is introduced. It returns the resulting fd according to the index. Signed-off-by: He Kuang Cc: Alexei Starovoitov Cc: He Kuang Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-8-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan [ Enclosed multi-line if/else blocks with {}, (*func_ptr)() -> func_ptr() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++--- tools/lib/bpf/libbpf.h | 64 ++++++++++++++++++++++ 2 files changed, 201 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e176bad..e3f4c33 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -152,7 +152,11 @@ struct bpf_program { } *reloc_desc; int nr_reloc; - int fd; + struct { + int nr; + int *fds; + } instances; + bpf_program_prep_t preprocessor; struct bpf_object *obj; void *priv; @@ -206,10 +210,25 @@ struct bpf_object { static void bpf_program__unload(struct bpf_program *prog) { + int i; + if (!prog) return; - zclose(prog->fd); + /* + * If the object is opened but the program was never loaded, + * it is possible that prog->instances.nr == -1.
+ */ + if (prog->instances.nr > 0) { + for (i = 0; i < prog->instances.nr; i++) + zclose(prog->instances.fds[i]); + } else if (prog->instances.nr != -1) { + pr_warning("Internal error: instances.nr is %d\n", + prog->instances.nr); + } + + prog->instances.nr = -1; + zfree(&prog->instances.fds); } static void bpf_program__exit(struct bpf_program *prog) @@ -260,7 +279,8 @@ bpf_program__init(void *data, size_t size, char *name, int idx, memcpy(prog->insns, data, prog->insns_cnt * sizeof(struct bpf_insn)); prog->idx = idx; - prog->fd = -1; + prog->instances.fds = NULL; + prog->instances.nr = -1; return 0; errout: @@ -860,13 +880,73 @@ static int bpf_program__load(struct bpf_program *prog, char *license, u32 kern_version) { - int err, fd; + int err = 0, fd, i; - err = load_program(prog->insns, prog->insns_cnt, - license, kern_version, &fd); - if (!err) - prog->fd = fd; + if (prog->instances.nr < 0 || !prog->instances.fds) { + if (prog->preprocessor) { + pr_warning("Internal error: can't load program '%s'\n", + prog->section_name); + return -LIBBPF_ERRNO__INTERNAL; + } + prog->instances.fds = malloc(sizeof(int)); + if (!prog->instances.fds) { + pr_warning("Not enough memory for BPF fds\n"); + return -ENOMEM; + } + prog->instances.nr = 1; + prog->instances.fds[0] = -1; + } + + if (!prog->preprocessor) { + if (prog->instances.nr != 1) { + pr_warning("Program '%s' is inconsistent: nr(%d) != 1\n", + prog->section_name, prog->instances.nr); + } + err = load_program(prog->insns, prog->insns_cnt, + license, kern_version, &fd); + if (!err) + prog->instances.fds[0] = fd; + goto out; + } + + for (i = 0; i < prog->instances.nr; i++) { + struct bpf_prog_prep_result result; + bpf_program_prep_t preprocessor = prog->preprocessor; + + bzero(&result, sizeof(result)); + err = preprocessor(prog, i, prog->insns, + prog->insns_cnt, &result); + if (err) { + pr_warning("Preprocessing the %dth instance of program '%s' failed\n", + i, prog->section_name); + goto out; + } + + if (!result.new_insn_ptr || !result.new_insn_cnt) { + pr_debug("Skip loading the %dth instance of program '%s'\n", + i, prog->section_name); + prog->instances.fds[i] = -1; + if (result.pfd) + *result.pfd = -1; + continue; + } + + err = load_program(result.new_insn_ptr, + result.new_insn_cnt, + license, kern_version, &fd); + + if (err) { + pr_warning("Loading the %dth instance of program '%s' failed\n", + i, prog->section_name); + goto out; + } + + if (result.pfd) + *result.pfd = fd; + prog->instances.fds[i] = fd; + } +out: if (err) pr_warning("failed to load program '%s'\n", prog->section_name); @@ -1121,5 +1201,53 @@ const char *bpf_program__title(struct bpf_program *prog, bool needs_copy) int bpf_program__fd(struct bpf_program *prog) { - return prog->fd; + return bpf_program__nth_fd(prog, 0); +} + +int bpf_program__set_prep(struct bpf_program *prog, int nr_instances, + bpf_program_prep_t prep) +{ + int *instances_fds; + + if (nr_instances <= 0 || !prep) + return -EINVAL; + + if (prog->instances.nr > 0 || prog->instances.fds) { + pr_warning("Can't set pre-processor after loading\n"); + return -EINVAL; + } + + instances_fds = malloc(sizeof(int) * nr_instances); + if (!instances_fds) { + pr_warning("alloc memory failed for fds\n"); + return -ENOMEM; + } + + /* fill all fd with -1 */ + memset(instances_fds, -1, sizeof(int) * nr_instances); + + prog->instances.nr = nr_instances; + prog->instances.fds = instances_fds; + prog->preprocessor = prep; + return 0; +} + +int bpf_program__nth_fd(struct bpf_program *prog, int n) +{ + int fd; + + if (n >= 
prog->instances.nr || n < 0) { + pr_warning("Can't get the %dth fd from program %s: only %d instances\n", + n, prog->section_name, prog->instances.nr); + return -EINVAL; + } + + fd = prog->instances.fds[n]; + if (fd < 0) { + pr_warning("%dth instance of program '%s' is invalid\n", + n, prog->section_name); + return -ENOENT; + } + + return fd; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c9a9aef..949df4b 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -88,6 +88,70 @@ const char *bpf_program__title(struct bpf_program *prog, bool needs_copy); int bpf_program__fd(struct bpf_program *prog); +struct bpf_insn; + +/* + * Libbpf allows callers to adjust BPF programs before they are loaded + * into the kernel. One program in an object file can be transformed into + * multiple variants to be attached to different code. + * + * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd + * are the APIs for this purpose. + * + * - bpf_program_prep_t: + * Defines the 'preprocessor', which is a caller-defined function + * passed to libbpf through bpf_program__set_prep(), and will be + * called before the program is loaded. The preprocessor should adjust + * the program once for each instance, according to the index + * passed to it. + * + * - bpf_program__set_prep: + * Attaches a preprocessor to a BPF program. The number of instances + * that should be created is also passed through this function. + * + * - bpf_program__nth_fd: + * After the program is loaded, get the resulting fd of each instance + * of the bpf program. + * + * If bpf_program__set_prep() is not used, the program will be loaded + * without adjustment during bpf_object__load(). The program has only + * one instance. In this case bpf_program__fd(prog) is equal to + * bpf_program__nth_fd(prog, 0). + */ + +struct bpf_prog_prep_result { + /* + * If not NULL, load new instruction array. + * If set to NULL, don't load this instance. + */ + struct bpf_insn *new_insn_ptr; + int new_insn_cnt; + + /* If not NULL, result fd is set to it */ + int *pfd; +}; + +/* + * Parameters of bpf_program_prep_t: + * - prog: The bpf_program being loaded. + * - n: Index of instance being generated. + * - insns: BPF instructions array. + * - insns_cnt: Number of instructions in insns. + * - res: Output parameter, result of transformation. + * + * Return value: + * - Zero: pre-processing success. + * - Non-zero: pre-processing failed, stop loading. + */ +typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, + struct bpf_insn *insns, int insns_cnt, + struct bpf_prog_prep_result *res); + +int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, + bpf_program_prep_t prep); + +int bpf_program__nth_fd(struct bpf_program *prog, int n); + /* * We don't need __attribute__((packed)) now since it is * unnecessary for 'bpf_map_def' because they are all aligned. -- cgit v1.1 From 1c0ed63239012aa881cc811f726b549dca7279e4 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:10 +0000 Subject: perf bpf: Add BPF_PROLOGUE config options for further patches If both LIBBPF and DWARF are detected, it is possible to create a prologue for eBPF programs to help them access kernel data. HAVE_BPF_PROLOGUE and CONFIG_BPF_PROLOGUE are added as flags for this feature.
PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET is introduced in commit 63ab024a5b6f295ca17a293ad81b7c728f49a89a ("perf tools: regs_query_register_offset() infrastructure"), which indicates that an architecture supports converting name of a register to its offset in 'struct pt_regs'. Without this support, BPF_PROLOGUE should be turned off. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-9-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/config/Makefile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index de89ec5..6eb9a95 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -318,6 +318,18 @@ ifndef NO_LIBELF CFLAGS += -DHAVE_LIBBPF_SUPPORT $(call detected,CONFIG_LIBBPF) endif + + ifndef NO_DWARF + ifdef PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET + CFLAGS += -DHAVE_BPF_PROLOGUE + $(call detected,CONFIG_BPF_PROLOGUE) + else + msg := $(warning BPF prologue is not supported by architecture $(ARCH), missing regs_query_register_offset()); + endif + else + msg := $(warning DWARF support is off, BPF prologue is disabled); + endif + endif # NO_LIBBPF endif # NO_LIBELF -- cgit v1.1 From 30433a3a52b951faab95944e0f8b9d33a1e322ce Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:11 +0000 Subject: perf bpf: Compile dwarf-regs.c if CONFIG_BPF_PROLOGUE is on regs_query_register_offset() in dwarf-regs.c is required by BPF prologue. This patch compiles it if CONFIG_BPF_PROLOGUE is on to avoid build failure when CONFIG_BPF_PROLOGUE is on but CONFIG_DWARF is not set. Signed-off-by: He Kuang Acked-by: Masami Hiramatsu Cc: Alexei Starovoitov Cc: He Kuang Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-10-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/Build | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index ff63649..4659703 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -5,6 +5,7 @@ libperf-y += kvm-stat.o libperf-y += perf_regs.o libperf-$(CONFIG_DWARF) += dwarf-regs.o +libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o -- cgit v1.1 From 361f2b1d1d7231b8685d990b886f599378a4d5a5 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:05 +0000 Subject: perf bpf: Allow BPF program attach to uprobe events This patch adds a new syntax to the BPF object section name to support probing at uprobe event. Now we can use BPF program like this: SEC( "exec=/lib64/libc.so.6;" "libcwrite=__write" ) int libcwrite(void *ctx) { return 1; } Where, in section name of a program, before the main config string, we can use 'key=value' style options. Now the only option key is "exec", for uprobes. 
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-4-git-send-email-wangnan0@huawei.com [ Changed the separator from \n to ; ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 120 ++++++++++++++++++++++++++++++++++++++++--- tools/perf/util/bpf-loader.h | 1 + 2 files changed, 115 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 4c50411..84169d6 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -110,6 +110,113 @@ bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused, } static int +config__exec(const char *value, struct perf_probe_event *pev) +{ + pev->uprobes = true; + pev->target = strdup(value); + if (!pev->target) + return -ENOMEM; + return 0; +} + +static struct { + const char *key; + const char *usage; + const char *desc; + int (*func)(const char *, struct perf_probe_event *); +} bpf_config_terms[] = { + { + .key = "exec", + .usage = "exec=", + .desc = "Set uprobe target", + .func = config__exec, + }, +}; + +static int +do_config(const char *key, const char *value, + struct perf_probe_event *pev) +{ + unsigned int i; + + pr_debug("config bpf program: %s=%s\n", key, value); + for (i = 0; i < ARRAY_SIZE(bpf_config_terms); i++) + if (strcmp(key, bpf_config_terms[i].key) == 0) + return bpf_config_terms[i].func(value, pev); + + pr_debug("BPF: ERROR: invalid config option in object: %s=%s\n", + key, value); + + pr_debug("\nHint: Currently valid options are:\n"); + for (i = 0; i < ARRAY_SIZE(bpf_config_terms); i++) + pr_debug("\t%s:\t%s\n", bpf_config_terms[i].usage, + bpf_config_terms[i].desc); + pr_debug("\n"); + + return -BPF_LOADER_ERRNO__CONFIG_TERM; +} + +static const char * +parse_config_kvpair(const char *config_str, struct perf_probe_event *pev) +{ + char *text = strdup(config_str); + char *sep, *line; + const char *main_str = NULL; + int err = 0; + + if (!text) { + pr_debug("Not enough memory: dup config_str failed\n"); + return ERR_PTR(-ENOMEM); + } + + line = text; + while ((sep = strchr(line, ';'))) { + char *equ; + + *sep = '\0'; + equ = strchr(line, '='); + if (!equ) { + pr_warning("WARNING: invalid config in BPF object: %s\n", + line); + pr_warning("\tShould be 'key=value'.\n"); + goto nextline; + } + *equ = '\0'; + + err = do_config(line, equ + 1, pev); + if (err) + break; +nextline: + line = sep + 1; + } + + if (!err) + main_str = config_str + (line - text); + free(text); + + return err ? ERR_PTR(err) : main_str; +} + +static int +parse_config(const char *config_str, struct perf_probe_event *pev) +{ + int err; + const char *main_str = parse_config_kvpair(config_str, pev); + + if (IS_ERR(main_str)) + return PTR_ERR(main_str); + + err = parse_perf_probe_command(main_str, pev); + if (err < 0) { + pr_debug("bpf: '%s' is not a valid config string\n", + config_str); + /* parse failed, no need to clear pev.
*/ + return -BPF_LOADER_ERRNO__CONFIG; + } + return 0; +} + +static int config_bpf_program(struct bpf_program *prog) { struct perf_probe_event *pev = NULL; @@ -131,13 +238,9 @@ config_bpf_program(struct bpf_program *prog) pev = &priv->pev; pr_debug("bpf: config program '%s'\n", config_str); - err = parse_perf_probe_command(config_str, pev); - if (err < 0) { - pr_debug("bpf: '%s' is not a valid config string\n", - config_str); - err = -BPF_LOADER_ERRNO__CONFIG; + err = parse_config(config_str, pev); + if (err) goto errout; - } if (pev->group && strcmp(pev->group, PERF_BPF_PROBE_GROUP)) { pr_debug("bpf: '%s': group for event is set and not '%s'.\n", @@ -340,6 +443,7 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(EVENTNAME)] = "No event name found in config string", [ERRCODE_OFFSET(INTERNAL)] = "BPF loader internal error", [ERRCODE_OFFSET(COMPILE)] = "Error when compiling BPF scriptlet", + [ERRCODE_OFFSET(CONFIG_TERM)] = "Invalid config term in config string", }; static int @@ -420,6 +524,10 @@ int bpf__strerror_probe(struct bpf_object *obj __maybe_unused, int err, char *buf, size_t size) { bpf__strerror_head(err, buf, size); + case BPF_LOADER_ERRNO__CONFIG_TERM: { + scnprintf(buf, size, "%s (add -v to see detail)", emsg); + break; + } bpf__strerror_entry(EEXIST, "Probe point exist. Try use 'perf probe -d \"*\"'"); bpf__strerror_entry(EACCES, "You need to be root"); bpf__strerror_entry(EPERM, "You need to be root, and /proc/sys/kernel/kptr_restrict should be 0"); diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index 9caf3ae..d19f5c5 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -20,6 +20,7 @@ enum bpf_loader_errno { BPF_LOADER_ERRNO__EVENTNAME, /* Event name is missing */ BPF_LOADER_ERRNO__INTERNAL, /* BPF loader internal error */ BPF_LOADER_ERRNO__COMPILE, /* Error when compiling BPF scriptlet */ + BPF_LOADER_ERRNO__CONFIG_TERM, /* Invalid config term in config term */ __BPF_LOADER_ERRNO__END, }; -- cgit v1.1 From 5dbd16c0c9d17ab1ab2226a5926482c26c0287ed Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:06 +0000 Subject: perf bpf: Allow attaching BPF programs to modules symbols By extending the syntax of BPF object section names, this patch allows users to attach BPF programs to symbols in modules. For example: SEC("module=i915;" "parse_cmds=i915_parse_cmds") int parse_cmds(void *ctx) { return 1; } The implementation is very simple: like what 'perf probe' does, for module, fill 'uprobe' field in 'struct perf_probe_event'. Other parts will be done automatically. 
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Kaixu Xia Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 84169d6..d0f02ed 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -119,6 +119,16 @@ config__exec(const char *value, struct perf_probe_event *pev) return 0; } +static int +config__module(const char *value, struct perf_probe_event *pev) +{ + pev->uprobes = false; + pev->target = strdup(value); + if (!pev->target) + return -ENOMEM; + return 0; +} + static struct { const char *key; const char *usage; @@ -131,6 +141,12 @@ static struct { .desc = "Set uprobe target", .func = config__exec, }, + { + .key = "module", + .usage = "module= ", + .desc = "Set kprobe module", + .func = config__module, + } }; static int -- cgit v1.1 From 03e01f568759ddbfdaff892e299758e7771a3478 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:08 +0000 Subject: perf bpf: Allow BPF program config probing options By extending the syntax of BPF object section names, this patch allows users to config probing options like what they can do in 'perf probe'. The error message in 'perf probe' is also updated. Test result: For following BPF file test_probe_glob.c: # cat test_probe_glob.c __attribute__((section("inlines=no;func=SyS_dup?"), used)) int func(void *ctx) { return 1; } char _license[] __attribute__((section("license"), used)) = "GPL"; int _version __attribute__((section("version"), used)) = 0x40300; # # ./perf record -e ./test_probe_glob.c ls / ... [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.013 MB perf.data ] # ./perf evlist perf_bpf_probe:func_1 perf_bpf_probe:func After changing "inlines=no" to "inlines=yes": # ./perf record -e ./test_probe_glob.c ls / ... [ perf record: Woken up 2 times to write data ] [ perf record: Captured and wrote 0.013 MB perf.data ] # ./perf evlist perf_bpf_probe:func_3 perf_bpf_probe:func_2 perf_bpf_probe:func_1 perf_bpf_probe:func Then test 'force': Use following program: # cat test_probe_force.c __attribute__((section("func=sys_write"), used)) int funca(void *ctx) { return 1; } __attribute__((section("force=yes;func=sys_write"), used)) int funcb(void *ctx) { return 1; } char _license[] __attribute__((section("license"), used)) = "GPL"; int _version __attribute__((section("version"), used)) = 0x40300; # # perf record -e ./test_probe_force.c usleep 1 Error: event "func" already exists. Hint: Remove existing event by 'perf probe -d' or force duplicates by 'perf probe -f' or set 'force=yes' in BPF source. event syntax error: './test_probe_force.c' \___ Probe point exist. Try 'perf probe -d "*"' and set 'force=yes' (add -v to see detail) ... 
Then replace 'force=no' to 'force=yes': # vim test_probe_force.c # perf record -e ./test_probe_force.c usleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.017 MB perf.data ] # perf evlist perf_bpf_probe:func_1 perf_bpf_probe:func # Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 53 +++++++++++++++++++++++++++++++++++++++++-- tools/perf/util/probe-event.c | 7 ++++-- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index d0f02ed..98f2e5d 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -7,6 +7,7 @@ #include #include +#include #include "perf.h" #include "debug.h" #include "bpf-loader.h" @@ -129,6 +130,38 @@ config__module(const char *value, struct perf_probe_event *pev) return 0; } +static int +config__bool(const char *value, + bool *pbool, bool invert) +{ + int err; + bool bool_value; + + if (!pbool) + return -EINVAL; + + err = strtobool(value, &bool_value); + if (err) + return err; + + *pbool = invert ? !bool_value : bool_value; + return 0; +} + +static int +config__inlines(const char *value, + struct perf_probe_event *pev __maybe_unused) +{ + return config__bool(value, &probe_conf.no_inlines, true); +} + +static int +config__force(const char *value, + struct perf_probe_event *pev __maybe_unused) +{ + return config__bool(value, &probe_conf.force_add, false); +} + static struct { const char *key; const char *usage; @@ -146,7 +179,19 @@ static struct { .usage = "module= ", .desc = "Set kprobe module", .func = config__module, - } + }, + { + .key = "inlines", + .usage = "inlines=[yes|no] ", + .desc = "Probe at inline symbol", + .func = config__inlines, + }, + { + .key = "force", + .usage = "force=[yes|no] ", + .desc = "Forcibly add events with existing name", + .func = config__force, + }, }; static int @@ -240,6 +285,10 @@ config_bpf_program(struct bpf_program *prog) const char *config_str; int err; + /* Initialize per-program probing setting */ + probe_conf.no_inlines = false; + probe_conf.force_add = false; + config_str = bpf_program__title(prog, false); if (IS_ERR(config_str)) { pr_debug("bpf: unable to get title for program\n"); @@ -544,7 +593,7 @@ int bpf__strerror_probe(struct bpf_object *obj __maybe_unused, scnprintf(buf, size, "%s (add -v to see detail)", emsg); break; } - bpf__strerror_entry(EEXIST, "Probe point exist. Try use 'perf probe -d \"*\"'"); + bpf__strerror_entry(EEXIST, "Probe point exist. Try 'perf probe -d \"*\"' and set 'force=yes'"); bpf__strerror_entry(EACCES, "You need to be root"); bpf__strerror_entry(EPERM, "You need to be root, and /proc/sys/kernel/kptr_restrict should be 0"); bpf__strerror_entry(ENOENT, "You need to check probing points in BPF file"); diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 03875f9..93996ec 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2326,8 +2326,11 @@ static int get_new_event_name(char *buf, size_t len, const char *base, goto out; if (!allow_suffix) { - pr_warning("Error: event \"%s\" already exists. 
" - "(Use -f to force duplicates.)\n", buf); + pr_warning("Error: event \"%s\" already exists.\n" + " Hint: Remove existing event by 'perf probe -d'\n" + " or force duplicates by 'perf probe -f'\n" + " or set 'force=yes' in BPF source.\n", + buf); ret = -EEXIST; goto out; } -- cgit v1.1 From bfc077b4cf106793b30bf942e426ee99f1f4ac44 Mon Sep 17 00:00:00 2001 From: He Kuang Date: Mon, 16 Nov 2015 12:10:12 +0000 Subject: perf bpf: Add prologue for BPF programs for fetching arguments This patch generates a prologue for a BPF program which fetches arguments for it. With this patch, the program can have arguments as follow: SEC("lock_page=__lock_page page->flags") int lock_page(struct pt_regs *ctx, int err, unsigned long flags) { return 1; } This patch passes at most 3 arguments from r3, r4 and r5. r1 is still the ctx pointer. r2 is used to indicate if dereferencing was done successfully. This patch uses r6 to hold ctx (struct pt_regs) and r7 to hold stack pointer for result. Result of each arguments first store on stack: low address BPF_REG_FP - 24 ARG3 BPF_REG_FP - 16 ARG2 BPF_REG_FP - 8 ARG1 BPF_REG_FP high address Then loaded into r3, r4 and r5. The output prologue for offn(...off2(off1(reg)))) should be: r6 <- r1 // save ctx into a callee saved register r7 <- fp r7 <- r7 - stack_offset // pointer to result slot /* load r3 with the offset in pt_regs of 'reg' */ (r7) <- r3 // make slot valid r3 <- r3 + off1 // prepare to read unsafe pointer r2 <- 8 r1 <- r7 // result put onto stack call probe_read // read unsafe pointer jnei r0, 0, err // error checking r3 <- (r7) // read result r3 <- r3 + off2 // prepare to read unsafe pointer r2 <- 8 r1 <- r7 call probe_read jnei r0, 0, err ... /* load r2, r3, r4 from stack */ goto success err: r2 <- 1 /* load r3, r4, r5 with 0 */ goto usercode success: r2 <- 0 usercode: r1 <- r6 // restore ctx // original user code If all of arguments reside in register (dereferencing is not required), gen_prologue_fastpath() will be used to create fast prologue: r3 <- (r1 + offset of reg1) r4 <- (r1 + offset of reg2) r5 <- (r1 + offset of reg3) r2 <- 0 P.S. 
eBPF calling convention is defined as: * r0 - return value from in-kernel function, and exit value for eBPF program * r1 - r5 - arguments from eBPF program to in-kernel function * r6 - r9 - callee saved registers that in-kernel function will preserve * r10 - read-only frame pointer to access stack Committer note: At least testing if it builds and loads: # cat test_probe_arg.c struct pt_regs; __attribute__((section("lock_page=__lock_page page->flags"), used)) int func(struct pt_regs *ctx, int err, unsigned long flags) { return 1; } char _license[] __attribute__((section("license"), used)) = "GPL"; int _version __attribute__((section("version"), used)) = 0x40300; # perf record -e ./test_probe_arg.c usleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.016 MB perf.data ] # perf evlist perf_bpf_probe:lock_page # Signed-off-by: He Kuang Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-11-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/bpf-loader.c | 3 + tools/perf/util/bpf-loader.h | 3 + tools/perf/util/bpf-prologue.c | 455 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/bpf-prologue.h | 34 +++ 5 files changed, 496 insertions(+) create mode 100644 tools/perf/util/bpf-prologue.c create mode 100644 tools/perf/util/bpf-prologue.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e231690..0513dd5 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -89,6 +89,7 @@ libperf-y += parse-branch-options.o libperf-y += parse-regs-options.o libperf-$(CONFIG_LIBBPF) += bpf-loader.o +libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o libperf-$(CONFIG_LIBELF) += symbol-elf.o libperf-$(CONFIG_LIBELF) += probe-file.o libperf-$(CONFIG_LIBELF) += probe-event.o diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 98f2e5d..bd14be4 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -509,6 +509,9 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(INTERNAL)] = "BPF loader internal error", [ERRCODE_OFFSET(COMPILE)] = "Error when compiling BPF scriptlet", [ERRCODE_OFFSET(CONFIG_TERM)] = "Invalid config term in config string", + [ERRCODE_OFFSET(PROLOGUE)] = "Failed to generate prologue", + [ERRCODE_OFFSET(PROLOGUE2BIG)] = "Prologue too big for program", + [ERRCODE_OFFSET(PROLOGUEOOB)] = "Offset out of bound for prologue", }; static int diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index d19f5c5..a58740b 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -21,6 +21,9 @@ enum bpf_loader_errno { BPF_LOADER_ERRNO__INTERNAL, /* BPF loader internal error */ BPF_LOADER_ERRNO__COMPILE, /* Error when compiling BPF scriptlet */ BPF_LOADER_ERRNO__CONFIG_TERM, /* Invalid config term in config term */ + BPF_LOADER_ERRNO__PROLOGUE, /* Failed to generate prologue */ + BPF_LOADER_ERRNO__PROLOGUE2BIG, /* Prologue too big for program */ + BPF_LOADER_ERRNO__PROLOGUEOOB, /* Offset out of bound for prologue */ __BPF_LOADER_ERRNO__END, }; diff --git a/tools/perf/util/bpf-prologue.c b/tools/perf/util/bpf-prologue.c new file mode 100644 index 0000000..6cdbee1 --- /dev/null +++ b/tools/perf/util/bpf-prologue.c @@ -0,0 +1,455 @@ +/* + * bpf-prologue.c + * + * Copyright (C) 2015 He Kuang + * Copyright (C) 2015 Wang Nan 
+ * Copyright (C) 2015 Huawei Inc. + */ + +#include +#include "perf.h" +#include "debug.h" +#include "bpf-loader.h" +#include "bpf-prologue.h" +#include "probe-finder.h" +#include +#include + +#define BPF_REG_SIZE 8 + +#define JMP_TO_ERROR_CODE -1 +#define JMP_TO_SUCCESS_CODE -2 +#define JMP_TO_USER_CODE -3 + +struct bpf_insn_pos { + struct bpf_insn *begin; + struct bpf_insn *end; + struct bpf_insn *pos; +}; + +static inline int +pos_get_cnt(struct bpf_insn_pos *pos) +{ + return pos->pos - pos->begin; +} + +static int +append_insn(struct bpf_insn new_insn, struct bpf_insn_pos *pos) +{ + if (!pos->pos) + return -BPF_LOADER_ERRNO__PROLOGUE2BIG; + + if (pos->pos + 1 >= pos->end) { + pr_err("bpf prologue: prologue too long\n"); + pos->pos = NULL; + return -BPF_LOADER_ERRNO__PROLOGUE2BIG; + } + + *(pos->pos)++ = new_insn; + return 0; +} + +static int +check_pos(struct bpf_insn_pos *pos) +{ + if (!pos->pos || pos->pos >= pos->end) + return -BPF_LOADER_ERRNO__PROLOGUE2BIG; + return 0; +} + +/* Give it a shorter name */ +#define ins(i, p) append_insn((i), (p)) + +/* + * Give a register name (in 'reg'), generate instruction to + * load register into an eBPF register rd: + * 'ldd target_reg, offset(ctx_reg)', where: + * ctx_reg is pre initialized to pointer of 'struct pt_regs'. + */ +static int +gen_ldx_reg_from_ctx(struct bpf_insn_pos *pos, int ctx_reg, + const char *reg, int target_reg) +{ + int offset = regs_query_register_offset(reg); + + if (offset < 0) { + pr_err("bpf: prologue: failed to get register %s\n", + reg); + return offset; + } + ins(BPF_LDX_MEM(BPF_DW, target_reg, ctx_reg, offset), pos); + + return check_pos(pos); +} + +/* + * Generate a BPF_FUNC_probe_read function call. + * + * src_base_addr_reg is a register holding base address, + * dst_addr_reg is a register holding dest address (on stack), + * result is: + * + * *[dst_addr_reg] = *([src_base_addr_reg] + offset) + * + * Arguments of BPF_FUNC_probe_read: + * ARG1: ptr to stack (dest) + * ARG2: size (8) + * ARG3: unsafe ptr (src) + */ +static int +gen_read_mem(struct bpf_insn_pos *pos, + int src_base_addr_reg, + int dst_addr_reg, + long offset) +{ + /* mov arg3, src_base_addr_reg */ + if (src_base_addr_reg != BPF_REG_ARG3) + ins(BPF_MOV64_REG(BPF_REG_ARG3, src_base_addr_reg), pos); + /* add arg3, #offset */ + if (offset) + ins(BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, offset), pos); + + /* mov arg2, #reg_size */ + ins(BPF_ALU64_IMM(BPF_MOV, BPF_REG_ARG2, BPF_REG_SIZE), pos); + + /* mov arg1, dst_addr_reg */ + if (dst_addr_reg != BPF_REG_ARG1) + ins(BPF_MOV64_REG(BPF_REG_ARG1, dst_addr_reg), pos); + + /* Call probe_read */ + ins(BPF_EMIT_CALL(BPF_FUNC_probe_read), pos); + /* + * Error processing: if read fail, goto error code, + * will be relocated. Target should be the start of + * error processing code. + */ + ins(BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, JMP_TO_ERROR_CODE), + pos); + + return check_pos(pos); +} + +/* + * Each arg should be bare register. Fetch and save them into argument + * registers (r3 - r5). + * + * BPF_REG_1 should have been initialized with pointer to + * 'struct pt_regs'. + */ +static int +gen_prologue_fastpath(struct bpf_insn_pos *pos, + struct probe_trace_arg *args, int nargs) +{ + int i, err = 0; + + for (i = 0; i < nargs; i++) { + err = gen_ldx_reg_from_ctx(pos, BPF_REG_1, args[i].value, + BPF_PROLOGUE_START_ARG_REG + i); + if (err) + goto errout; + } + + return check_pos(pos); +errout: + return err; +} + +/* + * Slow path: + * At least one argument has the form of 'offset($rx)'. 
+ * + * Following code first stores them into stack, then loads all of then + * to r2 - r5. + * Before final loading, the final result should be: + * + * low address + * BPF_REG_FP - 24 ARG3 + * BPF_REG_FP - 16 ARG2 + * BPF_REG_FP - 8 ARG1 + * BPF_REG_FP + * high address + * + * For each argument (described as: offn(...off2(off1(reg)))), + * generates following code: + * + * r7 <- fp + * r7 <- r7 - stack_offset // Ideal code should initialize r7 using + * // fp before generating args. However, + * // eBPF won't regard r7 as stack pointer + * // if it is generated by minus 8 from + * // another stack pointer except fp. + * // This is why we have to set r7 + * // to fp for each variable. + * r3 <- value of 'reg'-> generated using gen_ldx_reg_from_ctx() + * (r7) <- r3 // skip following instructions for bare reg + * r3 <- r3 + off1 . // skip if off1 == 0 + * r2 <- 8 \ + * r1 <- r7 |-> generated by gen_read_mem() + * call probe_read / + * jnei r0, 0, err ./ + * r3 <- (r7) + * r3 <- r3 + off2 . // skip if off2 == 0 + * r2 <- 8 \ // r2 may be broken by probe_read, so set again + * r1 <- r7 |-> generated by gen_read_mem() + * call probe_read / + * jnei r0, 0, err ./ + * ... + */ +static int +gen_prologue_slowpath(struct bpf_insn_pos *pos, + struct probe_trace_arg *args, int nargs) +{ + int err, i; + + for (i = 0; i < nargs; i++) { + struct probe_trace_arg *arg = &args[i]; + const char *reg = arg->value; + struct probe_trace_arg_ref *ref = NULL; + int stack_offset = (i + 1) * -8; + + pr_debug("prologue: fetch arg %d, base reg is %s\n", + i, reg); + + /* value of base register is stored into ARG3 */ + err = gen_ldx_reg_from_ctx(pos, BPF_REG_CTX, reg, + BPF_REG_ARG3); + if (err) { + pr_err("prologue: failed to get offset of register %s\n", + reg); + goto errout; + } + + /* Make r7 the stack pointer. */ + ins(BPF_MOV64_REG(BPF_REG_7, BPF_REG_FP), pos); + /* r7 += -8 */ + ins(BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, stack_offset), pos); + /* + * Store r3 (base register) onto stack + * Ensure fp[offset] is set. + * fp is the only valid base register when storing + * into stack. We are not allowed to use r7 as base + * register here. + */ + ins(BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_ARG3, + stack_offset), pos); + + ref = arg->ref; + while (ref) { + pr_debug("prologue: arg %d: offset %ld\n", + i, ref->offset); + err = gen_read_mem(pos, BPF_REG_3, BPF_REG_7, + ref->offset); + if (err) { + pr_err("prologue: failed to generate probe_read function call\n"); + goto errout; + } + + ref = ref->next; + /* + * Load previous result into ARG3. Use + * BPF_REG_FP instead of r7 because verifier + * allows FP based addressing only. 
+ */ + if (ref) + ins(BPF_LDX_MEM(BPF_DW, BPF_REG_ARG3, + BPF_REG_FP, stack_offset), pos); + } + } + + /* Final pass: read to registers */ + for (i = 0; i < nargs; i++) + ins(BPF_LDX_MEM(BPF_DW, BPF_PROLOGUE_START_ARG_REG + i, + BPF_REG_FP, -BPF_REG_SIZE * (i + 1)), pos); + + ins(BPF_JMP_IMM(BPF_JA, BPF_REG_0, 0, JMP_TO_SUCCESS_CODE), pos); + + return check_pos(pos); +errout: + return err; +} + +static int +prologue_relocate(struct bpf_insn_pos *pos, struct bpf_insn *error_code, + struct bpf_insn *success_code, struct bpf_insn *user_code) +{ + struct bpf_insn *insn; + + if (check_pos(pos)) + return -BPF_LOADER_ERRNO__PROLOGUE2BIG; + + for (insn = pos->begin; insn < pos->pos; insn++) { + struct bpf_insn *target; + u8 class = BPF_CLASS(insn->code); + u8 opcode; + + if (class != BPF_JMP) + continue; + opcode = BPF_OP(insn->code); + if (opcode == BPF_CALL) + continue; + + switch (insn->off) { + case JMP_TO_ERROR_CODE: + target = error_code; + break; + case JMP_TO_SUCCESS_CODE: + target = success_code; + break; + case JMP_TO_USER_CODE: + target = user_code; + break; + default: + pr_err("bpf prologue: internal error: relocation failed\n"); + return -BPF_LOADER_ERRNO__PROLOGUE; + } + + insn->off = target - (insn + 1); + } + return 0; +} + +int bpf__gen_prologue(struct probe_trace_arg *args, int nargs, + struct bpf_insn *new_prog, size_t *new_cnt, + size_t cnt_space) +{ + struct bpf_insn *success_code = NULL; + struct bpf_insn *error_code = NULL; + struct bpf_insn *user_code = NULL; + struct bpf_insn_pos pos; + bool fastpath = true; + int err = 0, i; + + if (!new_prog || !new_cnt) + return -EINVAL; + + if (cnt_space > BPF_MAXINSNS) + cnt_space = BPF_MAXINSNS; + + pos.begin = new_prog; + pos.end = new_prog + cnt_space; + pos.pos = new_prog; + + if (!nargs) { + ins(BPF_ALU64_IMM(BPF_MOV, BPF_PROLOGUE_FETCH_RESULT_REG, 0), + &pos); + + if (check_pos(&pos)) + goto errout; + + *new_cnt = pos_get_cnt(&pos); + return 0; + } + + if (nargs > BPF_PROLOGUE_MAX_ARGS) { + pr_warning("bpf: prologue: %d arguments are dropped\n", + nargs - BPF_PROLOGUE_MAX_ARGS); + nargs = BPF_PROLOGUE_MAX_ARGS; + } + + /* First pass: validation */ + for (i = 0; i < nargs; i++) { + struct probe_trace_arg_ref *ref = args[i].ref; + + if (args[i].value[0] == '@') { + /* TODO: fetch global variable */ + pr_err("bpf: prologue: global %s%+ld not support\n", + args[i].value, ref ? ref->offset : 0); + return -ENOTSUP; + } + + while (ref) { + /* fastpath is true if all args has ref == NULL */ + fastpath = false; + + /* + * Instruction encodes immediate value using + * s32, ref->offset is long. On systems which + * can't fill long in s32, refuse to process if + * ref->offset too large (or small). + */ +#ifdef __LP64__ +#define OFFSET_MAX ((1LL << 31) - 1) +#define OFFSET_MIN ((1LL << 31) * -1) + if (ref->offset > OFFSET_MAX || + ref->offset < OFFSET_MIN) { + pr_err("bpf: prologue: offset out of bound: %ld\n", + ref->offset); + return -BPF_LOADER_ERRNO__PROLOGUEOOB; + } +#endif + ref = ref->next; + } + } + pr_debug("prologue: pass validation\n"); + + if (fastpath) { + /* If all variables are registers... */ + pr_debug("prologue: fast path\n"); + err = gen_prologue_fastpath(&pos, args, nargs); + if (err) + goto errout; + } else { + pr_debug("prologue: slow path\n"); + + /* Initialization: move ctx to a callee saved register. 
*/ + ins(BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1), &pos); + + err = gen_prologue_slowpath(&pos, args, nargs); + if (err) + goto errout; + /* + * start of ERROR_CODE (only slow pass needs error code) + * mov r2 <- 1 // r2 is error number + * mov r3 <- 0 // r3, r4... should be touched or + * // verifier would complain + * mov r4 <- 0 + * ... + * goto usercode + */ + error_code = pos.pos; + ins(BPF_ALU64_IMM(BPF_MOV, BPF_PROLOGUE_FETCH_RESULT_REG, 1), + &pos); + + for (i = 0; i < nargs; i++) + ins(BPF_ALU64_IMM(BPF_MOV, + BPF_PROLOGUE_START_ARG_REG + i, + 0), + &pos); + ins(BPF_JMP_IMM(BPF_JA, BPF_REG_0, 0, JMP_TO_USER_CODE), + &pos); + } + + /* + * start of SUCCESS_CODE: + * mov r2 <- 0 + * goto usercode // skip + */ + success_code = pos.pos; + ins(BPF_ALU64_IMM(BPF_MOV, BPF_PROLOGUE_FETCH_RESULT_REG, 0), &pos); + + /* + * start of USER_CODE: + * Restore ctx to r1 + */ + user_code = pos.pos; + if (!fastpath) { + /* + * Only slow path needs restoring of ctx. In fast path, + * register are loaded directly from r1. + */ + ins(BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX), &pos); + err = prologue_relocate(&pos, error_code, success_code, + user_code); + if (err) + goto errout; + } + + err = check_pos(&pos); + if (err) + goto errout; + + *new_cnt = pos_get_cnt(&pos); + return 0; +errout: + return err; +} diff --git a/tools/perf/util/bpf-prologue.h b/tools/perf/util/bpf-prologue.h new file mode 100644 index 0000000..d94cbea --- /dev/null +++ b/tools/perf/util/bpf-prologue.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2015, He Kuang + * Copyright (C) 2015, Huawei Inc. + */ +#ifndef __BPF_PROLOGUE_H +#define __BPF_PROLOGUE_H + +#include +#include +#include "probe-event.h" + +#define BPF_PROLOGUE_MAX_ARGS 3 +#define BPF_PROLOGUE_START_ARG_REG BPF_REG_3 +#define BPF_PROLOGUE_FETCH_RESULT_REG BPF_REG_2 + +#ifdef HAVE_BPF_PROLOGUE +int bpf__gen_prologue(struct probe_trace_arg *args, int nargs, + struct bpf_insn *new_prog, size_t *new_cnt, + size_t cnt_space); +#else +static inline int +bpf__gen_prologue(struct probe_trace_arg *args __maybe_unused, + int nargs __maybe_unused, + struct bpf_insn *new_prog __maybe_unused, + size_t *new_cnt, + size_t cnt_space __maybe_unused) +{ + if (!new_cnt) + return -EINVAL; + *new_cnt = 0; + return -ENOTSUP; +} +#endif +#endif /* __BPF_PROLOGUE_H */ -- cgit v1.1 From a08357d8dc7d3025d1094f727ad1f7e837766f93 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:13 +0000 Subject: perf bpf: Generate prologue for BPF programs This patch generates a prologue for each 'struct probe_trace_event' for fetching arguments for BPF programs. After bpf__probe(), iterate over each program to check whether prologues are required. If none of the 'struct perf_probe_event' programs will attach to have at least one argument, simply skip preprocessor hooking. For those who a prologue is required, call bpf__gen_prologue() and paste the original instruction after the prologue. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-12-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 120 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index bd14be4..190a1c7 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -5,12 +5,15 @@ * Copyright (C) 2015 Huawei Inc. 
*/ +#include #include #include #include #include "perf.h" #include "debug.h" #include "bpf-loader.h" +#include "bpf-prologue.h" +#include "llvm-utils.h" #include "probe-event.h" #include "probe-finder.h" // for MAX_PROBES #include "llvm-utils.h" @@ -33,6 +36,8 @@ DEFINE_PRINT_FN(debug, 1) struct bpf_prog_priv { struct perf_probe_event pev; + bool need_prologue; + struct bpf_insn *insns_buf; }; static bool libbpf_initialized; @@ -107,6 +112,7 @@ bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused, struct bpf_prog_priv *priv = _priv; cleanup_perf_probe_events(&priv->pev, 1); + zfree(&priv->insns_buf); free(priv); } @@ -365,6 +371,102 @@ static int bpf__prepare_probe(void) return err; } +static int +preproc_gen_prologue(struct bpf_program *prog, int n, + struct bpf_insn *orig_insns, int orig_insns_cnt, + struct bpf_prog_prep_result *res) +{ + struct probe_trace_event *tev; + struct perf_probe_event *pev; + struct bpf_prog_priv *priv; + struct bpf_insn *buf; + size_t prologue_cnt = 0; + int err; + + err = bpf_program__get_private(prog, (void **)&priv); + if (err || !priv) + goto errout; + + pev = &priv->pev; + + if (n < 0 || n >= pev->ntevs) + goto errout; + + tev = &pev->tevs[n]; + + buf = priv->insns_buf; + err = bpf__gen_prologue(tev->args, tev->nargs, + buf, &prologue_cnt, + BPF_MAXINSNS - orig_insns_cnt); + if (err) { + const char *title; + + title = bpf_program__title(prog, false); + if (!title) + title = "[unknown]"; + + pr_debug("Failed to generate prologue for program %s\n", + title); + return err; + } + + memcpy(&buf[prologue_cnt], orig_insns, + sizeof(struct bpf_insn) * orig_insns_cnt); + + res->new_insn_ptr = buf; + res->new_insn_cnt = prologue_cnt + orig_insns_cnt; + res->pfd = NULL; + return 0; + +errout: + pr_debug("Internal error in preproc_gen_prologue\n"); + return -BPF_LOADER_ERRNO__PROLOGUE; +} + +static int hook_load_preprocessor(struct bpf_program *prog) +{ + struct perf_probe_event *pev; + struct bpf_prog_priv *priv; + bool need_prologue = false; + int err, i; + + err = bpf_program__get_private(prog, (void **)&priv); + if (err || !priv) { + pr_debug("Internal error when hook preprocessor\n"); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + pev = &priv->pev; + for (i = 0; i < pev->ntevs; i++) { + struct probe_trace_event *tev = &pev->tevs[i]; + + if (tev->nargs > 0) { + need_prologue = true; + break; + } + } + + /* + * Since all tevs don't have argument, we don't need generate + * prologue. + */ + if (!need_prologue) { + priv->need_prologue = false; + return 0; + } + + priv->need_prologue = true; + priv->insns_buf = malloc(sizeof(struct bpf_insn) * BPF_MAXINSNS); + if (!priv->insns_buf) { + pr_debug("No enough memory: alloc insns_buf failed\n"); + return -ENOMEM; + } + + err = bpf_program__set_prep(prog, pev->ntevs, + preproc_gen_prologue); + return err; +} + int bpf__probe(struct bpf_object *obj) { int err = 0; @@ -399,6 +501,18 @@ int bpf__probe(struct bpf_object *obj) pr_debug("bpf_probe: failed to apply perf probe events"); goto out; } + + /* + * After probing, let's consider prologue, which + * adds program fetcher to BPF programs. + * + * hook_load_preprocessorr() hooks pre-processor + * to bpf_program, let it generate prologue + * dynamically during loading. + */ + err = hook_load_preprocessor(prog); + if (err) + goto out; } out: return err < 0 ? 
err : 0; @@ -482,7 +596,11 @@ int bpf__foreach_tev(struct bpf_object *obj, for (i = 0; i < pev->ntevs; i++) { tev = &pev->tevs[i]; - fd = bpf_program__fd(prog); + if (priv->need_prologue) + fd = bpf_program__nth_fd(prog, i); + else + fd = bpf_program__fd(prog); + if (fd < 0) { pr_debug("bpf: failed to get file descriptor\n"); return fd; -- cgit v1.1 From bbb7d4925a05ecd5bbfdbc1147d402b0db203a5a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 16 Nov 2015 12:10:14 +0000 Subject: perf test: Test the BPF prologue adding infrastructure This patch introduces a new BPF script to test the BPF prologue adding routines. The new script probes at null_lseek, which is the function pointer used when we try to lseek on '/dev/null'. The null_lseek function is chosen because it is used by function pointers, so we don't need to consider inlining and LTO. By extracting file->f_mode, bpf-script-test-prologue.c should know whether the file is writable or readonly. According to llseek_loop() and bpf-script-test-prologue.c, one fourth of total lseeks should be collected. Committer note: Testing it: # perf test -v BPF Kernel build dir is set to /lib/modules/4.3.0+/build set env: KBUILD_DIR=/lib/modules/4.3.0+/build unset env: KBUILD_OPTS include option is set to -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/4.9.2/include -I/home/git/linux/arch/x86/include -Iarch/x86/include/generated/uapi -Iarch/x86/include/generated -I/home/git/linux/include -Iinclude -I/home/git/linux/arch/x86/include/uapi -Iarch/x86/include/generated/uapi -I/home/git/linux/include/uapi -Iinclude/generated/uapi -include /home/git/linux/include/linux/kconfig.h set env: NR_CPUS=4 set env: LINUX_VERSION_CODE=0x40300 set env: CLANG_EXEC=/usr/libexec/icecc/bin/clang set env: CLANG_OPTIONS=-xc set env: KERNEL_INC_OPTIONS= -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/4.9.2/include -I/home/git/linux/arch/x86/include -Iarch/x86/include/generated/uapi -Iarch/x86/include/generated -I/home/git/linux/include -Iinclude -I/home/git/linux/arch/x86/include/uapi -Iarch/x86/include/generated/uapi -I/home/git/linux/include/uapi -Iinclude/generated/uapi -include /home/git/linux/include/linux/kconfig.h set env: WORKING_DIR=/lib/modules/4.3.0+/build set env: CLANG_SOURCE=- llvm compiling command template: echo '/* * bpf-script-test-prologue.c * Test BPF prologue */ #ifndef LINUX_VERSION_CODE # error Need LINUX_VERSION_CODE # error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' #endif #define SEC(NAME) __attribute__((section(NAME), used)) #include #define FMODE_READ 0x1 #define FMODE_WRITE 0x2 static void (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= (void *) 6; SEC("func=null_lseek file->f_mode offset orig") int bpf_func__null_lseek(void *ctx, int err, unsigned long f_mode, unsigned long offset, unsigned long orig) { if (err) return 0; if (f_mode & FMODE_WRITE) return 0; if (offset & 1) return 0; if (orig == SEEK_CUR) return 0; return 1; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; ' | $CLANG_EXEC -D__KERNEL__ -D__NR_CPUS__=$NR_CPUS -DLINUX_VERSION_CODE=$LINUX_VERSION_CODE $CLANG_OPTIONS $KERNEL_INC_OPTIONS -Wno-unused-value -Wno-pointer-sign -working-directory $WORKING_DIR -c "$CLANG_SOURCE" -target bpf -O2 -o - libbpf: loading object '[bpf_prologue_test]' from buffer libbpf: section .strtab, size 135, link 0, flags 0, type=3 libbpf: section .text, size 0, link 0, flags 6, type=1 libbpf: section .data, size 0, link 0, flags 3, type=1 libbpf: section .bss, size 0, link 0, flags 3, type=8 libbpf: section func=null_lseek file->f_mode offset orig, size 112, link 0, flags 6, type=1 libbpf: found program func=null_lseek file->f_mode offset orig libbpf: section license, size 4, link 0, flags 3, type=1 libbpf: license of [bpf_prologue_test] is GPL libbpf: section version, size 4, link 0, flags 3, type=1 libbpf: kernel version of [bpf_prologue_test] is 40300 libbpf: section .symtab, size 168, link 1, flags 0, type=2 bpf: config program 'func=null_lseek file->f_mode offset orig' symbol:null_lseek file:(null) line:0 offset:0 return:0 lazy:(null) parsing arg: file->f_mode into file, f_mode(1) parsing arg: offset into offset parsing arg: orig into orig bpf: config 'func=null_lseek file->f_mode offset orig' is ok Looking at the vmlinux_path (7 entries long) Using /lib/modules/4.3.0+/build/vmlinux for symbols Open Debuginfo file: /lib/modules/4.3.0+/build/vmlinux Try to find probe point from debuginfo. Matched function: null_lseek Probe point found: null_lseek+0 Searching 'file' variable in context. Converting variable file into trace event. converting f_mode in file f_mode type is unsigned int. Searching 'offset' variable in context. Converting variable offset into trace event. offset type is long long int. Searching 'orig' variable in context. Converting variable orig into trace event. orig type is int. Found 1 probe_trace_events. 
Opening /sys/kernel/debug/tracing//kprobe_events write=1 Writing event: p:perf_bpf_probe/func _text+4840528 f_mode=+68(%di):u32 offset=%si:s64 orig=%dx:s32 libbpf: don't need create maps for [bpf_prologue_test] prologue: pass validation prologue: slow path prologue: fetch arg 0, base reg is %di prologue: arg 0: offset 68 prologue: fetch arg 1, base reg is %si prologue: fetch arg 2, base reg is %dx add bpf event perf_bpf_probe:func and attach bpf program 3 adding perf_bpf_probe:func adding perf_bpf_probe:func to 0x51672c0 mmap size 1052672B Opening /sys/kernel/debug/tracing//kprobe_events write=1 Opening /sys/kernel/debug/tracing//uprobe_events write=1 Parsing probe_events: p:perf_bpf_probe/func _text+4840528 f_mode=+68(%di):u32 offset=%si:s64 orig=%dx:s32 Group:perf_bpf_probe Event:func probe:p Writing event: -:perf_bpf_probe/func test child finished with 0 ---- end ---- Test BPF filter: Ok # Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447675815-166222-13-git-send-email-wangnan0@huawei.com [ Added tools/perf/tests/llvm-src-prologue.c to .gitignore ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/.gitignore | 1 + tools/perf/tests/Build | 9 +++++++- tools/perf/tests/bpf-script-test-prologue.c | 35 +++++++++++++++++++++++++++++ tools/perf/tests/bpf.c | 34 ++++++++++++++++++++++++++++ tools/perf/tests/llvm.c | 4 ++++ tools/perf/tests/llvm.h | 2 ++ 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 tools/perf/tests/bpf-script-test-prologue.c diff --git a/tools/perf/tests/.gitignore b/tools/perf/tests/.gitignore index 489fc9f..bf016c4 100644 --- a/tools/perf/tests/.gitignore +++ b/tools/perf/tests/.gitignore @@ -1,2 +1,3 @@ llvm-src-base.c llvm-src-kbuild.c +llvm-src-prologue.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index f41ebf8..0ff8a97 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -31,7 +31,7 @@ perf-y += sample-parsing.o perf-y += parse-no-sample-id-all.o perf-y += kmod-path.o perf-y += thread-map.o -perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o +perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o perf-y += bpf.o perf-y += topology.o @@ -49,6 +49,13 @@ $(OUTPUT)tests/llvm-src-kbuild.c: tests/bpf-script-test-kbuild.c $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@ $(Q)echo ';' >> $@ +$(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c + $(call rule_mkdir) + $(Q)echo '#include ' > $@ + $(Q)echo 'const char test_llvm__bpf_test_prologue_prog[] =' >> $@ + $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@ + $(Q)echo ';' >> $@ + ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64)) perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o endif diff --git a/tools/perf/tests/bpf-script-test-prologue.c b/tools/perf/tests/bpf-script-test-prologue.c new file mode 100644 index 0000000..7230e62 --- /dev/null +++ b/tools/perf/tests/bpf-script-test-prologue.c @@ -0,0 +1,35 @@ +/* + * bpf-script-test-prologue.c + * Test BPF prologue + */ +#ifndef LINUX_VERSION_CODE +# error Need LINUX_VERSION_CODE +# error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' +#endif +#define SEC(NAME) __attribute__((section(NAME), used)) + +#include + +#define FMODE_READ 0x1 +#define FMODE_WRITE 0x2 + +static void (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= + (void *) 6; + +SEC("func=null_lseek file->f_mode offset orig") +int bpf_func__null_lseek(void *ctx, int err, unsigned long f_mode, + unsigned long offset, unsigned long orig) +{ + if (err) + return 0; + if (f_mode & FMODE_WRITE) + return 0; + if (offset & 1) + return 0; + if (orig == SEEK_CUR) + return 0; + return 1; +} + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = LINUX_VERSION_CODE; diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 6ebfdee..d584422 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -19,6 +19,29 @@ static int epoll_pwait_loop(void) return 0; } +#ifdef HAVE_BPF_PROLOGUE + +static int llseek_loop(void) +{ + int fds[2], i; + + fds[0] = open("/dev/null", O_RDONLY); + fds[1] = open("/dev/null", O_RDWR); + + if (fds[0] < 0 || fds[1] < 0) + return -1; + + for (i = 0; i < NR_ITERS; i++) { + lseek(fds[i % 2], i, (i / 2) % 2 ? SEEK_CUR : SEEK_SET); + lseek(fds[(i + 1) % 2], i, (i / 2) % 2 ? SEEK_CUR : SEEK_SET); + } + close(fds[0]); + close(fds[1]); + return 0; +} + +#endif + static struct { enum test_llvm__testcase prog_id; const char *desc; @@ -37,6 +60,17 @@ static struct { &epoll_pwait_loop, (NR_ITERS + 1) / 2, }, +#ifdef HAVE_BPF_PROLOGUE + { + LLVM_TESTCASE_BPF_PROLOGUE, + "Test BPF prologue generation", + "[bpf_prologue_test]", + "fix kbuild first", + "check your vmlinux setting?", + &llseek_loop, + (NR_ITERS + 1) / 4, + }, +#endif }; static int do_test(struct bpf_object *obj, int (*func)(void), diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index 366e38b..b414763 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -44,6 +44,10 @@ static struct { .source = test_llvm__bpf_test_kbuild_prog, .desc = "Test kbuild searching", }, + [LLVM_TESTCASE_BPF_PROLOGUE] = { + .source = test_llvm__bpf_test_prologue_prog, + .desc = "Test BPF prologue generation", + }, }; diff --git a/tools/perf/tests/llvm.h b/tools/perf/tests/llvm.h index d91d8f4..5150b4d 100644 --- a/tools/perf/tests/llvm.h +++ b/tools/perf/tests/llvm.h @@ -6,10 +6,12 @@ extern const char test_llvm__bpf_base_prog[]; extern const char test_llvm__bpf_test_kbuild_prog[]; +extern const char test_llvm__bpf_test_prologue_prog[]; enum test_llvm__testcase { LLVM_TESTCASE_BASE, LLVM_TESTCASE_KBUILD, + LLVM_TESTCASE_BPF_PROLOGUE, __LLVM_TESTCASE_MAX, }; -- cgit v1.1 From ad0dd7aed5df8009b3ffa39bec73ad93283332c9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 17 Nov 2015 08:32:46 +0000 Subject: perf test: Fix 'perf test BPF' when it fails to find a suitable vmlinux Two bugs in 'perf test BPF' are found when testing BPF prologue without vmlinux: # mv /lib/modules/4.3.0-rc4+/build/vmlinux{,.bak} # ./perf test BPF 37: Test BPF filter :Failed to find the path for kernel: No such file or directory Ok Test BPF should fail in this case. After this patch: # ./perf test BPF 37: Test BPF filter :Failed to find the path for kernel: No such file or directory FAILED! 
# mv /lib/modules/4.3.0-rc4+/build/vmlinux{.bak,}
# ./perf test BPF
37: Test BPF filter                  : Ok

Signed-off-by: Wang Nan
Cc: Alexei Starovoitov
Cc: Masami Hiramatsu
Cc: Zefan Li
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1447749170-175898-2-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/tests/bpf.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index d584422..232043c 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -102,8 +102,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
 	err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj);
 	if (err || list_empty(&parse_evlist.list)) {
 		pr_debug("Failed to add events selected by BPF\n");
-		if (!err)
-			return TEST_FAIL;
+		return TEST_FAIL;
 	}
 
 	snprintf(pid, sizeof(pid), "%d", getpid());
@@ -157,8 +156,10 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
 		}
 	}
 
-	if (count != expect)
+	if (count != expect) {
 		pr_debug("BPF filter result incorrect\n");
+		goto out_delete_evlist;
+	}
 
 	ret = TEST_OK;
--
cgit v1.1


From d35b32891a61f1d3909bdc5280badf309adc4693 Mon Sep 17 00:00:00 2001
From: Wang Nan
Date: Tue, 17 Nov 2015 08:32:47 +0000
Subject: perf bpf: Use same BPF program if arguments are identical

This patch allows creating only one BPF program for different
'probe_trace_event'(tev) entries generated by one
'perf_probe_event'(pev) if their prologues are identical. This is done
by comparing the argument lists of the tev instances and recording
which prologue type each tev maps to in a mapping array.

This patch utilizes qsort to sort the tevs. After sorting, tevs with
identical argument lists will be grouped together.

Test result:

Sample BPF program:

  #define SEC(NAME) __attribute__((section(NAME), used))
  SEC("inlines=no;"
      "func=SyS_dup? oldfd")
  int func(void *ctx)
  {
  	return 1;
  }

It would probe at SyS_dup2 and SyS_dup3, obtaining oldfd as its
argument. The following cmdline shows a BPF program being loaded into
the kernel by perf:

  # perf record -e ./test_bpf_arg.c sleep 4 & sleep 1 && ls /proc/$!/fd/ -l | grep bpf-prog

Before this patch:

  # perf record -e ./test_bpf_arg.c sleep 4 & sleep 1 && ls /proc/$!/fd/ -l | grep bpf-prog
  [1] 24858
  lrwx------ 1 root root 64 Nov 14 04:09 3 -> anon_inode:bpf-prog
  lrwx------ 1 root root 64 Nov 14 04:09 4 -> anon_inode:bpf-prog
  ...

After this patch:

  # perf record -e ./test_bpf_arg.c sleep 4 & sleep 1 && ls /proc/$!/fd/ -l | grep bpf-prog
  [1] 25699
  lrwx------ 1 root root 64 Nov 14 04:10 3 -> anon_inode:bpf-prog
  ...
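The grouping strategy itself can be modelled outside of perf. The sketch
below (plain C with illustrative names, not part of the patch) sorts
items by their argument string with qsort() and gives equal neighbours
the same type number, which is essentially what map_prologue() in the
diff below does with 'struct probe_trace_event' pointers:

  #include <stdlib.h>
  #include <string.h>

  struct item {
  	const char *args;	/* stand-in for a tev's argument list */
  	int type;		/* prologue type assigned below */
  };

  static int cmp(const void *a, const void *b)
  {
  	return strcmp(((const struct item *)a)->args,
  		      ((const struct item *)b)->args);
  }

  /* Sort, then assign identical neighbours the same type; return nr_types. */
  static int assign_types(struct item *items, int n)
  {
  	int i, type = 0;

  	qsort(items, n, sizeof(*items), cmp);
  	for (i = 0; i < n; i++) {
  		if (i && cmp(&items[i], &items[i - 1]))
  			type++;
  		items[i].type = type;
  	}
  	return n ? type + 1 : 0;
  }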
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447749170-175898-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 138 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 131 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 190a1c7..36544e5 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -38,6 +38,8 @@ struct bpf_prog_priv { struct perf_probe_event pev; bool need_prologue; struct bpf_insn *insns_buf; + int nr_types; + int *type_mapping; }; static bool libbpf_initialized; @@ -113,6 +115,7 @@ bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused, cleanup_perf_probe_events(&priv->pev, 1); zfree(&priv->insns_buf); + zfree(&priv->type_mapping); free(priv); } @@ -381,7 +384,7 @@ preproc_gen_prologue(struct bpf_program *prog, int n, struct bpf_prog_priv *priv; struct bpf_insn *buf; size_t prologue_cnt = 0; - int err; + int i, err; err = bpf_program__get_private(prog, (void **)&priv); if (err || !priv) @@ -389,10 +392,21 @@ preproc_gen_prologue(struct bpf_program *prog, int n, pev = &priv->pev; - if (n < 0 || n >= pev->ntevs) + if (n < 0 || n >= priv->nr_types) goto errout; - tev = &pev->tevs[n]; + /* Find a tev belongs to that type */ + for (i = 0; i < pev->ntevs; i++) { + if (priv->type_mapping[i] == n) + break; + } + + if (i >= pev->ntevs) { + pr_debug("Internal error: prologue type %d not found\n", n); + return -BPF_LOADER_ERRNO__PROLOGUE; + } + + tev = &pev->tevs[i]; buf = priv->insns_buf; err = bpf__gen_prologue(tev->args, tev->nargs, @@ -423,6 +437,101 @@ errout: return -BPF_LOADER_ERRNO__PROLOGUE; } +/* + * compare_tev_args is reflexive, transitive and antisymmetric. + * I can proof it but this margin is too narrow to contain. + */ +static int compare_tev_args(const void *ptev1, const void *ptev2) +{ + int i, ret; + const struct probe_trace_event *tev1 = + *(const struct probe_trace_event **)ptev1; + const struct probe_trace_event *tev2 = + *(const struct probe_trace_event **)ptev2; + + ret = tev2->nargs - tev1->nargs; + if (ret) + return ret; + + for (i = 0; i < tev1->nargs; i++) { + struct probe_trace_arg *arg1, *arg2; + struct probe_trace_arg_ref *ref1, *ref2; + + arg1 = &tev1->args[i]; + arg2 = &tev2->args[i]; + + ret = strcmp(arg1->value, arg2->value); + if (ret) + return ret; + + ref1 = arg1->ref; + ref2 = arg2->ref; + + while (ref1 && ref2) { + ret = ref2->offset - ref1->offset; + if (ret) + return ret; + + ref1 = ref1->next; + ref2 = ref2->next; + } + + if (ref1 || ref2) + return ref2 ? 1 : -1; + } + + return 0; +} + +/* + * Assign a type number to each tevs in a pev. + * mapping is an array with same slots as tevs in that pev. + * nr_types will be set to number of types. 
+ */ +static int map_prologue(struct perf_probe_event *pev, int *mapping, + int *nr_types) +{ + int i, type = 0; + struct probe_trace_event **ptevs; + + size_t array_sz = sizeof(*ptevs) * pev->ntevs; + + ptevs = malloc(array_sz); + if (!ptevs) { + pr_debug("No ehough memory: alloc ptevs failed\n"); + return -ENOMEM; + } + + pr_debug("In map_prologue, ntevs=%d\n", pev->ntevs); + for (i = 0; i < pev->ntevs; i++) + ptevs[i] = &pev->tevs[i]; + + qsort(ptevs, pev->ntevs, sizeof(*ptevs), + compare_tev_args); + + for (i = 0; i < pev->ntevs; i++) { + int n; + + n = ptevs[i] - pev->tevs; + if (i == 0) { + mapping[n] = type; + pr_debug("mapping[%d]=%d\n", n, type); + continue; + } + + if (compare_tev_args(ptevs + i, ptevs + i - 1) == 0) + mapping[n] = type; + else + mapping[n] = ++type; + + pr_debug("mapping[%d]=%d\n", n, mapping[n]); + } + free(ptevs); + *nr_types = type + 1; + + return 0; +} + static int hook_load_preprocessor(struct bpf_program *prog) { struct perf_probe_event *pev; @@ -462,7 +571,19 @@ static int hook_load_preprocessor(struct bpf_program *prog) return -ENOMEM; } - err = bpf_program__set_prep(prog, pev->ntevs, + priv->type_mapping = malloc(sizeof(int) * pev->ntevs); + if (!priv->type_mapping) { + pr_debug("No enough memory: alloc type_mapping failed\n"); + return -ENOMEM; + } + memset(priv->type_mapping, -1, + sizeof(int) * pev->ntevs); + + err = map_prologue(pev, priv->type_mapping, &priv->nr_types); + if (err) + return err; + + err = bpf_program__set_prep(prog, priv->nr_types, preproc_gen_prologue); return err; } @@ -596,10 +717,13 @@ int bpf__foreach_tev(struct bpf_object *obj, for (i = 0; i < pev->ntevs; i++) { tev = &pev->tevs[i]; - if (priv->need_prologue) - fd = bpf_program__nth_fd(prog, i); - else + if (priv->need_prologue) { + int type = priv->type_mapping[i]; + + fd = bpf_program__nth_fd(prog, type); + } else { fd = bpf_program__fd(prog); + } if (fd < 0) { pr_debug("bpf: failed to get file descriptor\n"); -- cgit v1.1 From 2f7a3f8e871eb0713d23c533bd5e44a544e43eb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2015 15:09:46 +0100 Subject: x86/tsc: Remove unused tsc_pre_init() hook No more users. Remove it. 
Signed-off-by: Thomas Gleixner
Cc: Borislav Petkov
---
 arch/x86/include/asm/x86_init.h | 2 --
 arch/x86/kernel/tsc.c           | 2 --
 arch/x86/kernel/x86_init.c      | 1 -
 3 files changed, 5 deletions(-)

diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 48d34d2..10002a4 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -83,13 +83,11 @@ struct x86_init_paging {
  * struct x86_init_timers - platform specific timer setup
  * @setup_perpcu_clockev:	set up the per cpu clock event device for the
  *				boot cpu
- * @tsc_pre_init:		platform function called before TSC init
  * @timer_init:			initialize the platform timer (default PIT/HPET)
  * @wallclock_init:		init the wallclock device
  */
 struct x86_init_timers {
 	void (*setup_percpu_clockev)(void);
-	void (*tsc_pre_init)(void);
 	void (*timer_init)(void);
 	void (*wallclock_init)(void);
 };
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c7c4d9c..3d743da 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1185,8 +1185,6 @@ void __init tsc_init(void)
 	u64 lpj;
 	int cpu;
 
-	x86_init.timers.tsc_pre_init();
-
 	if (!cpu_has_tsc) {
 		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
 		return;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 3839628..dad5fe9 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -68,7 +68,6 @@ struct x86_init_ops x86_init __initdata = {
 
 	.timers = {
 		.setup_percpu_clockev	= setup_boot_APIC_clock,
-		.tsc_pre_init		= x86_init_noop,
 		.timer_init		= hpet_time_init,
 		.wallclock_init		= x86_init_noop,
 	},
--
cgit v1.1


From 460958659270b7d750d4ccfe052171cb6f655cbb Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Tue, 17 Nov 2015 14:44:32 +0100
Subject: x86/paravirt: Remove unused pv_apic_ops structure

The only member of that structure is startup_ipi_hook which is always
set to paravirt_nop.
Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Cc: jeremy@goop.org Cc: chrisw@sous-sol.org Cc: akataria@vmware.com Cc: rusty@rustcorp.com.au Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xen.org Cc: konrad.wilk@oracle.com Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/1447767872-16730-1-git-send-email-jgross@suse.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 9 --------- arch/x86/include/asm/paravirt_types.h | 10 ---------- arch/x86/include/asm/smp.h | 3 --- arch/x86/kernel/paravirt.c | 8 -------- arch/x86/kernel/smpboot.c | 7 ------- arch/x86/xen/enlighten.c | 7 ------- 6 files changed, 44 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596..4d7f080 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -285,15 +285,6 @@ static inline void slow_down_io(void) #endif } -#ifdef CONFIG_SMP -static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, - unsigned long start_esp) -{ - PVOP_VCALL3(pv_apic_ops.startup_ipi_hook, - phys_apicid, start_eip, start_esp); -} -#endif - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e1f31df..7afeafb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -215,14 +215,6 @@ struct pv_irq_ops { #endif }; -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - struct pv_mmu_ops { unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); @@ -354,7 +346,6 @@ struct paravirt_patch_template { struct pv_time_ops pv_time_ops; struct pv_cpu_ops pv_cpu_ops; struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; struct pv_mmu_ops pv_mmu_ops; struct pv_lock_ops pv_lock_ops; }; @@ -364,7 +355,6 @@ extern struct pv_init_ops pv_init_ops; extern struct pv_time_ops pv_time_ops; extern struct pv_cpu_ops pv_cpu_ops; extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; extern struct pv_mmu_ops pv_mmu_ops; extern struct pv_lock_ops pv_lock_ops; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index a438c55..dfcf072 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -65,9 +65,6 @@ struct smp_ops { extern void set_cpu_sibling_map(int cpu); #ifdef CONFIG_SMP -#ifndef CONFIG_PARAVIRT -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0) -#endif extern struct smp_ops smp_ops; static inline void smp_send_stop(void) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4f32a10..f27962c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -123,7 +123,6 @@ static void *get_call_destination(u8 type) .pv_time_ops = pv_time_ops, .pv_cpu_ops = pv_cpu_ops, .pv_irq_ops = pv_irq_ops, - .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, #ifdef CONFIG_PARAVIRT_SPINLOCKS .pv_lock_ops = pv_lock_ops, @@ -392,12 +391,6 @@ NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); -struct pv_apic_ops pv_apic_ops = { -#ifdef CONFIG_X86_LOCAL_APIC - .startup_ipi_hook = paravirt_nop, -#endif -}; - #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) /* 32-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) @@ 
-481,6 +474,5 @@ struct pv_mmu_ops pv_mmu_ops = {
 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL    (pv_cpu_ops);
 EXPORT_SYMBOL    (pv_mmu_ops);
-EXPORT_SYMBOL_GPL(pv_apic_ops);
 EXPORT_SYMBOL_GPL(pv_info);
 EXPORT_SYMBOL    (pv_irq_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 892ee2e5..4df7777 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -629,13 +629,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	num_starts = 0;
 
 	/*
-	 * Paravirt / VMI wants a startup IPI hook here to set up the
-	 * target processor state.
-	 */
-	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
-			 stack_start);
-
-	/*
 	 * Run STARTUP IPI loop.
 	 */
 	pr_debug("#startup loops: %d\n", num_starts);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 993b7a7..2745e8a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1264,12 +1264,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.end_context_switch = xen_end_context_switch,
 };
 
-static const struct pv_apic_ops xen_apic_ops __initconst = {
-#ifdef CONFIG_X86_LOCAL_APIC
-	.startup_ipi_hook = paravirt_nop,
-#endif
-};
-
 static void xen_reboot(int reason)
 {
 	struct sched_shutdown r = { .reason = reason };
@@ -1535,7 +1529,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
-	pv_apic_ops = xen_apic_ops;
 
 	if (!xen_pvh_domain()) {
 		pv_cpu_ops = xen_cpu_ops;
--
cgit v1.1


From 721a1f53df6aad3ea941f5fe95519d0d8e02bd65 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Thu, 19 Nov 2015 12:01:48 -0300
Subject: perf tests: Pass the subtest index to each test routine

Some tests have sub-tests we want to run, so allow passing a subtest
index. Wang tried to avoid having to touch all tests, but having
test.func in an anonymous union makes the build fail on older
compilers, like the one in RHEL6, where:

  test a = {
  	.func = foo,
  };

fails.

To fix it, leave the func pointer in the main structure and pass the
subtest index to all tests. The end result functionality is the same,
but we have just one function pointer, not two (with and without the
subtest index as an argument).
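A stripped-down model of the resulting convention is shown below; this
is a sketch, not the actual tests/tests.h definitions ('__maybe_unused'
expands to __attribute__((unused)) in the tools headers):

  struct test {
  	const char *desc;
  	int (*func)(int subtest);
  };

  /* A test without sub-tests simply ignores the index. */
  static int test__example(int subtest __maybe_unused)
  {
  	return 0; /* TEST_OK */
  }

  static struct test example_test = {
  	.desc = "example test",
  	.func = test__example,
  };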
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-5genj0ficwdmelpoqlds0u4y@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/include/arch-tests.h | 8 +-- tools/perf/arch/x86/tests/insn-x86.c | 2 +- tools/perf/arch/x86/tests/intel-cqm.c | 2 +- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 2 +- tools/perf/arch/x86/tests/rdpmc.c | 2 +- tools/perf/tests/attr.c | 2 +- tools/perf/tests/bp_signal.c | 2 +- tools/perf/tests/bp_signal_overflow.c | 2 +- tools/perf/tests/bpf.c | 2 +- tools/perf/tests/builtin-test.c | 6 +-- tools/perf/tests/code-reading.c | 2 +- tools/perf/tests/dso-data.c | 6 +-- tools/perf/tests/dwarf-unwind.c | 2 +- tools/perf/tests/evsel-roundtrip-name.c | 2 +- tools/perf/tests/evsel-tp-sched.c | 2 +- tools/perf/tests/fdarray.c | 4 +- tools/perf/tests/hists_cumulate.c | 2 +- tools/perf/tests/hists_filter.c | 2 +- tools/perf/tests/hists_link.c | 2 +- tools/perf/tests/hists_output.c | 2 +- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/kmod-path.c | 2 +- tools/perf/tests/llvm.c | 2 +- tools/perf/tests/mmap-basic.c | 2 +- tools/perf/tests/mmap-thread-lookup.c | 2 +- tools/perf/tests/openat-syscall-all-cpus.c | 2 +- tools/perf/tests/openat-syscall-tp-fields.c | 2 +- tools/perf/tests/openat-syscall.c | 2 +- tools/perf/tests/parse-events.c | 2 +- tools/perf/tests/parse-no-sample-id-all.c | 2 +- tools/perf/tests/perf-record.c | 2 +- tools/perf/tests/pmu.c | 2 +- tools/perf/tests/python-use.c | 3 +- tools/perf/tests/sample-parsing.c | 2 +- tools/perf/tests/sw-clock.c | 2 +- tools/perf/tests/switch-tracking.c | 2 +- tools/perf/tests/task-exit.c | 2 +- tools/perf/tests/tests.h | 78 ++++++++++++++-------------- tools/perf/tests/thread-map.c | 2 +- tools/perf/tests/thread-mg-share.c | 2 +- tools/perf/tests/topology.c | 2 +- tools/perf/tests/vmlinux-kallsyms.c | 2 +- 42 files changed, 89 insertions(+), 88 deletions(-) diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h index 7ed00f4..b48de2f 100644 --- a/tools/perf/arch/x86/include/arch-tests.h +++ b/tools/perf/arch/x86/include/arch-tests.h @@ -2,10 +2,10 @@ #define ARCH_TESTS_H /* Tests */ -int test__rdpmc(void); -int test__perf_time_to_tsc(void); -int test__insn_x86(void); -int test__intel_cqm_count_nmi_context(void); +int test__rdpmc(int subtest); +int test__perf_time_to_tsc(int subtest); +int test__insn_x86(int subtest); +int test__intel_cqm_count_nmi_context(int subtest); #ifdef HAVE_DWARF_UNWIND_SUPPORT struct thread; diff --git a/tools/perf/arch/x86/tests/insn-x86.c b/tools/perf/arch/x86/tests/insn-x86.c index b6115df..08d9b2bc 100644 --- a/tools/perf/arch/x86/tests/insn-x86.c +++ b/tools/perf/arch/x86/tests/insn-x86.c @@ -171,7 +171,7 @@ static int test_data_set(struct test_data *dat_set, int x86_64) * verbose (-v) option to see all the instructions and whether or not they * decoded successfuly. */ -int test__insn_x86(void) +int test__insn_x86(int subtest __maybe_unused) { int ret = 0; diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index d28c1b6..94e0cb7 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c @@ -33,7 +33,7 @@ static pid_t spawn(void) * the last read counter value to avoid triggering a WARN_ON_ONCE() in * smp_call_function_many() caused by sending IPIs from NMI context. 
*/ -int test__intel_cqm_count_nmi_context(void) +int test__intel_cqm_count_nmi_context(int subtest __maybe_unused) { struct perf_evlist *evlist = NULL; struct perf_evsel *evsel = NULL; diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 658cd20..a289aa8 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -35,7 +35,7 @@ * %0 is returned, otherwise %-1 is returned. If TSC conversion is not * supported then then the test passes but " (not supported)" is printed. */ -int test__perf_time_to_tsc(void) +int test__perf_time_to_tsc(int subtest __maybe_unused) { struct record_opts opts = { .mmap_pages = UINT_MAX, diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c index e768821..7bb0d13 100644 --- a/tools/perf/arch/x86/tests/rdpmc.c +++ b/tools/perf/arch/x86/tests/rdpmc.c @@ -149,7 +149,7 @@ out_close: return 0; } -int test__rdpmc(void) +int test__rdpmc(int subtest __maybe_unused) { int status = 0; int wret = 0; diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 638875a..b66730eb9 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -153,7 +153,7 @@ static int run_dir(const char *d, const char *perf) return system(cmd); } -int test__attr(void) +int test__attr(int subtest __maybe_unused) { struct stat st; char path_perf[PATH_MAX]; diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index a02b035..fb80c9e 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -111,7 +111,7 @@ static long long bp_count(int fd) return count; } -int test__bp_signal(void) +int test__bp_signal(int subtest __maybe_unused) { struct sigaction sa; long long count1, count2; diff --git a/tools/perf/tests/bp_signal_overflow.c b/tools/perf/tests/bp_signal_overflow.c index e765377..89f92fa 100644 --- a/tools/perf/tests/bp_signal_overflow.c +++ b/tools/perf/tests/bp_signal_overflow.c @@ -58,7 +58,7 @@ static long long bp_count(int fd) #define EXECUTIONS 10000 #define THRESHOLD 100 -int test__bp_signal_overflow(void) +int test__bp_signal_overflow(int subtest __maybe_unused) { struct perf_event_attr pe; struct sigaction sa; diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 232043c..4efdc16 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -215,7 +215,7 @@ out: return ret; } -int test__bpf(void) +int test__bpf(int subtest __maybe_unused) { unsigned int i; int err; diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 80c442e..9cf4892 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -203,7 +203,7 @@ static bool perf_test__matches(struct test *test, int curr, int argc, const char return false; } -static int run_test(struct test *test) +static int run_test(struct test *test, int subtest) { int status, err = -1, child = fork(); char sbuf[STRERR_BUFSIZE]; @@ -216,7 +216,7 @@ static int run_test(struct test *test) if (!child) { pr_debug("test child forked, pid %d\n", getpid()); - err = test->func(); + err = test->func(subtest); exit(err); } @@ -265,7 +265,7 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) } pr_debug("\n--- start ---\n"); - err = run_test(t); + err = run_test(t, i); pr_debug("---- end ----\n%s:", t->desc); switch (err) { diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index a767a64..4417b6a 100644 --- a/tools/perf/tests/code-reading.c 
+++ b/tools/perf/tests/code-reading.c @@ -601,7 +601,7 @@ out_err: return err; } -int test__code_reading(void) +int test__code_reading(int subtest __maybe_unused) { int ret; diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index a218aea..dc673ff 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -110,7 +110,7 @@ static int dso__data_fd(struct dso *dso, struct machine *machine) return fd; } -int test__dso_data(void) +int test__dso_data(int subtest __maybe_unused) { struct machine machine; struct dso *dso; @@ -245,7 +245,7 @@ static int set_fd_limit(int n) return setrlimit(RLIMIT_NOFILE, &rlim); } -int test__dso_data_cache(void) +int test__dso_data_cache(int subtest __maybe_unused) { struct machine machine; long nr_end, nr = open_files_cnt(); @@ -302,7 +302,7 @@ int test__dso_data_cache(void) return 0; } -int test__dso_data_reopen(void) +int test__dso_data_reopen(int subtest __maybe_unused) { struct machine machine; long nr_end, nr = open_files_cnt(); diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 0722179..01f0b61 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -142,7 +142,7 @@ static int krava_1(struct thread *thread) return krava_2(thread); } -int test__dwarf_unwind(void) +int test__dwarf_unwind(int subtest __maybe_unused) { struct machines machines; struct machine *machine; diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index 3fa7159..1da92e1 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -95,7 +95,7 @@ out_delete_evlist: #define perf_evsel__name_array_test(names) \ __perf_evsel__name_array_test(names, ARRAY_SIZE(names)) -int test__perf_evsel__roundtrip_name_test(void) +int test__perf_evsel__roundtrip_name_test(int subtest __maybe_unused) { int err = 0, ret = 0; diff --git a/tools/perf/tests/evsel-tp-sched.c b/tools/perf/tests/evsel-tp-sched.c index 790e413..1984b3b 100644 --- a/tools/perf/tests/evsel-tp-sched.c +++ b/tools/perf/tests/evsel-tp-sched.c @@ -32,7 +32,7 @@ static int perf_evsel__test_field(struct perf_evsel *evsel, const char *name, return ret; } -int test__perf_evsel__tp_sched_test(void) +int test__perf_evsel__tp_sched_test(int subtest __maybe_unused) { struct perf_evsel *evsel = perf_evsel__newtp("sched", "sched_switch"); int ret = 0; diff --git a/tools/perf/tests/fdarray.c b/tools/perf/tests/fdarray.c index d24b837..c809463 100644 --- a/tools/perf/tests/fdarray.c +++ b/tools/perf/tests/fdarray.c @@ -25,7 +25,7 @@ static int fdarray__fprintf_prefix(struct fdarray *fda, const char *prefix, FILE return printed + fdarray__fprintf(fda, fp); } -int test__fdarray__filter(void) +int test__fdarray__filter(int subtest __maybe_unused) { int nr_fds, expected_fd[2], fd, err = TEST_FAIL; struct fdarray *fda = fdarray__new(5, 5); @@ -103,7 +103,7 @@ out: return err; } -int test__fdarray__add(void) +int test__fdarray__add(int subtest __maybe_unused) { int err = TEST_FAIL; struct fdarray *fda = fdarray__new(2, 2); diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 7ed73701..8292948 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -686,7 +686,7 @@ out: return err; } -int test__hists_cumulate(void) +int test__hists_cumulate(int subtest __maybe_unused) { int err = TEST_FAIL; struct machines machines; diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index 
818acf8..ccb5b49 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -104,7 +104,7 @@ out: return TEST_FAIL; } -int test__hists_filter(void) +int test__hists_filter(int subtest __maybe_unused) { int err = TEST_FAIL; struct machines machines; diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 8c102b0..6243e2b 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -274,7 +274,7 @@ static int validate_link(struct hists *leader, struct hists *other) return __validate_link(leader, 0) || __validate_link(other, 1); } -int test__hists_link(void) +int test__hists_link(int subtest __maybe_unused) { int err = -1; struct hists *hists, *first_hists; diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index adbebc8..248beec 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -576,7 +576,7 @@ out: return err; } -int test__hists_output(void) +int test__hists_output(int subtest __maybe_unused) { int err = TEST_FAIL; struct machines machines; diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index a2e2269..a337a6d 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -49,7 +49,7 @@ static int find_comm(struct perf_evlist *evlist, const char *comm) * when an event is disabled but a dummy software event is not disabled. If the * test passes %0 is returned, otherwise %-1 is returned. */ -int test__keep_tracking(void) +int test__keep_tracking(int subtest __maybe_unused) { struct record_opts opts = { .mmap_pages = UINT_MAX, diff --git a/tools/perf/tests/kmod-path.c b/tools/perf/tests/kmod-path.c index 08c433b..d2af781 100644 --- a/tools/perf/tests/kmod-path.c +++ b/tools/perf/tests/kmod-path.c @@ -49,7 +49,7 @@ static int test_is_kernel_module(const char *path, int cpumode, bool expect) #define M(path, c, e) \ TEST_ASSERT_VAL("failed", !test_is_kernel_module(path, c, e)) -int test__kmod_path__parse(void) +int test__kmod_path__parse(int subtest __maybe_unused) { /* path alloc_name alloc_ext kmod comp name ext */ T("/xxxx/xxxx/x-x.ko", true , true , true, false, "[x_x]", NULL); diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index b414763..4350c45 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -131,7 +131,7 @@ out: return ret; } -int test__llvm(void) +int test__llvm(int subtest __maybe_unused) { enum test_llvm__testcase i; diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 4495493..359e98f 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -16,7 +16,7 @@ * Then it checks if the number of syscalls reported as perf events by * the kernel corresponds to the number of syscalls made. */ -int test__basic_mmap(void) +int test__basic_mmap(int subtest __maybe_unused) { int err = -1; union perf_event *event; diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index 145050e..6cdb975 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -221,7 +221,7 @@ static int mmap_events(synth_cb synth) * * by using all thread objects. 
*/ -int test__mmap_thread_lookup(void) +int test__mmap_thread_lookup(int subtest __maybe_unused) { /* perf_event__synthesize_threads synthesize */ TEST_ASSERT_VAL("failed with sythesizing all", diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c index 2006485..53c2273 100644 --- a/tools/perf/tests/openat-syscall-all-cpus.c +++ b/tools/perf/tests/openat-syscall-all-cpus.c @@ -7,7 +7,7 @@ #include "debug.h" #include "stat.h" -int test__openat_syscall_event_on_all_cpus(void) +int test__openat_syscall_event_on_all_cpus(int subtest __maybe_unused) { int err = -1, fd, cpu; struct cpu_map *cpus; diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index 5e811cd..eb99a105 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -6,7 +6,7 @@ #include "tests.h" #include "debug.h" -int test__syscall_openat_tp_fields(void) +int test__syscall_openat_tp_fields(int subtest __maybe_unused) { struct record_opts opts = { .target = { diff --git a/tools/perf/tests/openat-syscall.c b/tools/perf/tests/openat-syscall.c index 033b547..1184f9b 100644 --- a/tools/perf/tests/openat-syscall.c +++ b/tools/perf/tests/openat-syscall.c @@ -5,7 +5,7 @@ #include "debug.h" #include "tests.h" -int test__openat_syscall_event(void) +int test__openat_syscall_event(int subtest __maybe_unused) { int err = -1, fd; struct perf_evsel *evsel; diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 636d7b4..abe8849 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1765,7 +1765,7 @@ static void debug_warn(const char *warn, va_list params) fprintf(stderr, " Warning: %s\n", msg); } -int test__parse_events(void) +int test__parse_events(int subtest __maybe_unused) { int ret1, ret2 = 0; diff --git a/tools/perf/tests/parse-no-sample-id-all.c b/tools/perf/tests/parse-no-sample-id-all.c index 2c63ea6..294c76b 100644 --- a/tools/perf/tests/parse-no-sample-id-all.c +++ b/tools/perf/tests/parse-no-sample-id-all.c @@ -67,7 +67,7 @@ struct test_attr_event { * * Return: %0 on success, %-1 if the test fails. 
*/ -int test__parse_no_sample_id_all(void) +int test__parse_no_sample_id_all(int subtest __maybe_unused) { int err; diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 7a228a2..9d5f0b5 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -32,7 +32,7 @@ realloc: return cpu; } -int test__PERF_RECORD(void) +int test__PERF_RECORD(int subtest __maybe_unused) { struct record_opts opts = { .target = { diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c index faa04e9..1e2ba26 100644 --- a/tools/perf/tests/pmu.c +++ b/tools/perf/tests/pmu.c @@ -133,7 +133,7 @@ static struct list_head *test_terms_list(void) return &terms; } -int test__pmu(void) +int test__pmu(int subtest __maybe_unused) { char *format = test_format_dir_get(); LIST_HEAD(formats); diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c index 7760277..7a52834 100644 --- a/tools/perf/tests/python-use.c +++ b/tools/perf/tests/python-use.c @@ -4,11 +4,12 @@ #include #include +#include #include "tests.h" extern int verbose; -int test__python_use(void) +int test__python_use(int subtest __maybe_unused) { char *cmd; int ret; diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 30c0218..5f23710 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -290,7 +290,7 @@ out_free: * checks sample format bits separately and together. If the test passes %0 is * returned, otherwise %-1 is returned. */ -int test__sample_parsing(void) +int test__sample_parsing(int subtest __maybe_unused) { const u64 rf[] = {4, 5, 6, 7, 12, 13, 14, 15}; u64 sample_type; diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 5b83f56..36e8ce1 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -122,7 +122,7 @@ out_delete_evlist: return err; } -int test__sw_clock_freq(void) +int test__sw_clock_freq(int subtest __maybe_unused) { int ret; diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index a02af50..dfbd8d6 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -305,7 +305,7 @@ out_free_nodes: * evsel->system_wide and evsel->tracking flags (respectively) with other events * sometimes enabled or disabled. */ -int test__switch_tracking(void) +int test__switch_tracking(int subtest __maybe_unused) { const char *sched_switch = "sched:sched_switch"; struct switch_tracking switch_tracking = { .tids = NULL, }; diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index add1638..2dfff7a 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -31,7 +31,7 @@ static void workload_exec_failed_signal(int signo __maybe_unused, * if the number of exit event reported by the kernel is 1 or not * in order to check the kernel returns correct number of event. 
*/ -int test__task_exit(void) +int test__task_exit(int subtest __maybe_unused) { int err = -1; union perf_event *event; diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 3c8734a..204e4ee 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -26,48 +26,48 @@ enum { struct test { const char *desc; - int (*func)(void); + int (*func)(int subtest); }; /* Tests */ -int test__vmlinux_matches_kallsyms(void); -int test__openat_syscall_event(void); -int test__openat_syscall_event_on_all_cpus(void); -int test__basic_mmap(void); -int test__PERF_RECORD(void); -int test__perf_evsel__roundtrip_name_test(void); -int test__perf_evsel__tp_sched_test(void); -int test__syscall_openat_tp_fields(void); -int test__pmu(void); -int test__attr(void); -int test__dso_data(void); -int test__dso_data_cache(void); -int test__dso_data_reopen(void); -int test__parse_events(void); -int test__hists_link(void); -int test__python_use(void); -int test__bp_signal(void); -int test__bp_signal_overflow(void); -int test__task_exit(void); -int test__sw_clock_freq(void); -int test__code_reading(void); -int test__sample_parsing(void); -int test__keep_tracking(void); -int test__parse_no_sample_id_all(void); -int test__dwarf_unwind(void); -int test__hists_filter(void); -int test__mmap_thread_lookup(void); -int test__thread_mg_share(void); -int test__hists_output(void); -int test__hists_cumulate(void); -int test__switch_tracking(void); -int test__fdarray__filter(void); -int test__fdarray__add(void); -int test__kmod_path__parse(void); -int test__thread_map(void); -int test__llvm(void); -int test__bpf(void); -int test_session_topology(void); +int test__vmlinux_matches_kallsyms(int subtest); +int test__openat_syscall_event(int subtest); +int test__openat_syscall_event_on_all_cpus(int subtest); +int test__basic_mmap(int subtest); +int test__PERF_RECORD(int subtest); +int test__perf_evsel__roundtrip_name_test(int subtest); +int test__perf_evsel__tp_sched_test(int subtest); +int test__syscall_openat_tp_fields(int subtest); +int test__pmu(int subtest); +int test__attr(int subtest); +int test__dso_data(int subtest); +int test__dso_data_cache(int subtest); +int test__dso_data_reopen(int subtest); +int test__parse_events(int subtest); +int test__hists_link(int subtest); +int test__python_use(int subtest); +int test__bp_signal(int subtest); +int test__bp_signal_overflow(int subtest); +int test__task_exit(int subtest); +int test__sw_clock_freq(int subtest); +int test__code_reading(int subtest); +int test__sample_parsing(int subtest); +int test__keep_tracking(int subtest); +int test__parse_no_sample_id_all(int subtest); +int test__dwarf_unwind(int subtest); +int test__hists_filter(int subtest); +int test__mmap_thread_lookup(int subtest); +int test__thread_mg_share(int subtest); +int test__hists_output(int subtest); +int test__hists_cumulate(int subtest); +int test__switch_tracking(int subtest); +int test__fdarray__filter(int subtest); +int test__fdarray__add(int subtest); +int test__kmod_path__parse(int subtest); +int test__thread_map(int subtest); +int test__llvm(int subtest); +int test__bpf(int subtest); +int test_session_topology(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 138a0e3..2be02d3 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -4,7 +4,7 @@ #include "thread_map.h" #include "debug.h" -int test__thread_map(void) +int test__thread_map(int 
subtest __maybe_unused) { struct thread_map *map;
diff --git a/tools/perf/tests/thread-mg-share.c b/tools/perf/tests/thread-mg-share.c
index 01fabb1..188b631 100644
--- a/tools/perf/tests/thread-mg-share.c
+++ b/tools/perf/tests/thread-mg-share.c
@@ -4,7 +4,7 @@
 #include "map.h"
 #include "debug.h"
-int test__thread_mg_share(void)
+int test__thread_mg_share(int subtest __maybe_unused)
 { struct machines machines; struct machine *machine;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index f5bb096..98fe69a 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -84,7 +84,7 @@ static int check_cpu_topology(char *path, struct cpu_map *map) return 0; }
-int test_session_topology(void)
+int test_session_topology(int subtest __maybe_unused)
 { char path[PATH_MAX]; struct cpu_map *map;
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index d677e01..f0bfc9e 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -18,7 +18,7 @@ static int vmlinux_matches_kallsyms_filter(struct map *map __maybe_unused, #define UM(x) kallsyms_map->unmap_ip(kallsyms_map, (x))
-int test__vmlinux_matches_kallsyms(void)
+int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
 { int err = -1; struct rb_node *nd;
-- cgit v1.1

From e8c6d500447c577e669c24ec04cd4173fe9f9afb Mon Sep 17 00:00:00 2001
From: Wang Nan
Date: Tue, 17 Nov 2015 08:32:48 +0000
Subject: perf test: Print result for each LLVM subtest

Currently 'perf test llvm' and 'perf test BPF' have multiple sub-tests, but the result is provided in only one line:

 # perf test LLVM
 35: Test LLVM searching and compiling : Ok

This patch introduces sub-test support, allowing 'perf test' to report the result of each sub-test:

 # perf test LLVM
 35: Test LLVM searching and compiling :
 35.1: Basic BPF llvm compiling test : Ok
 35.2: Test kbuild searching : Ok
 35.3: Compile source for BPF prologue generation test : Ok

When a failure happens:

 # cat ~/.perfconfig
 [llvm]
 clang-path = "/bin/false"
 # perf test LLVM
 35: Test LLVM searching and compiling :
 35.1: Basic BPF llvm compiling test : FAILED!
 35.2: Test kbuild searching : Skip
 35.3: Compile source for BPF prologue generation test : Skip

And:

 # rm ~/.perfconfig
 # ./perf test LLVM
 35: Test LLVM searching and compiling :
 35.1: Basic BPF llvm compiling test : Skip
 35.2: Test kbuild searching : Skip
 35.3: Compile source for BPF prologue generation test : Skip

Skip by user:

 # ./perf test -s 1,`seq -s , 3 42`
 1: vmlinux symtab matches kallsyms : Skip (user override)
 2: detect openat syscall event : Ok
 ...
 35: Test LLVM searching and compiling : Skip (user override)
 ...
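[Aside, not from the patches themselves: the sub-test contract this series introduces boils down to three optional callbacks on struct test, matching the tests.h diff below; the runner loop is a simplified sketch of how the harness consumes them.]

----
/* Each test may expose optional sub-test callbacks. */
struct test {
	const char *desc;
	int (*func)(int subtest);	/* gets -1 when there are no sub-tests */
	struct {
		bool skip_if_fail;	/* skip remaining sub-tests after a failure */
		int (*get_nr)(void);	/* how many sub-tests exist */
		const char *(*get_desc)(int subtest);
	} subtest;
};

/* A harness consumes it roughly like this: */
static void run_one(struct test *t)
{
	int i, nr;

	if (!t->subtest.get_nr) {
		t->func(-1);		/* classic single-shot test */
		return;
	}
	nr = t->subtest.get_nr();
	for (i = 0; i < nr; i++)
		t->func(i);		/* one invocation per sub-test */
}
----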
Suggested-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447749170-175898-4-git-send-email-wangnan0@huawei.com [ Changed so that func is not on an anonymous union ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 91 ++++++++++++++++++++++++++++++++++------- tools/perf/tests/llvm.c | 65 ++++++++++++++--------------- tools/perf/tests/tests.h | 9 ++++ 3 files changed, 115 insertions(+), 50 deletions(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 9cf4892..8136609 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -160,6 +160,11 @@ static struct test generic_tests[] = { { .desc = "Test LLVM searching and compiling", .func = test__llvm, + .subtest = { + .skip_if_fail = true, + .get_nr = test__llvm_subtest_get_nr, + .get_desc = test__llvm_subtest_get_desc, + }, }, { .desc = "Test topology in session", @@ -237,6 +242,40 @@ static int run_test(struct test *test, int subtest) for (j = 0; j < ARRAY_SIZE(tests); j++) \ for (t = &tests[j][0]; t->func; t++) +static int test_and_print(struct test *t, bool force_skip, int subtest) +{ + int err; + + if (!force_skip) { + pr_debug("\n--- start ---\n"); + err = run_test(t, subtest); + pr_debug("---- end ----\n"); + } else { + pr_debug("\n--- force skipped ---\n"); + err = TEST_SKIP; + } + + if (!t->subtest.get_nr) + pr_debug("%s:", t->desc); + else + pr_debug("%s subtest %d:", t->desc, subtest); + + switch (err) { + case TEST_OK: + pr_info(" Ok\n"); + break; + case TEST_SKIP: + color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip\n"); + break; + case TEST_FAIL: + default: + color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n"); + break; + } + + return err; +} + static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) { struct test *t; @@ -264,21 +303,43 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist) continue; } - pr_debug("\n--- start ---\n"); - err = run_test(t, i); - pr_debug("---- end ----\n%s:", t->desc); - - switch (err) { - case TEST_OK: - pr_info(" Ok\n"); - break; - case TEST_SKIP: - color_fprintf(stderr, PERF_COLOR_YELLOW, " Skip\n"); - break; - case TEST_FAIL: - default: - color_fprintf(stderr, PERF_COLOR_RED, " FAILED!\n"); - break; + if (!t->subtest.get_nr) { + test_and_print(t, false, -1); + } else { + int subn = t->subtest.get_nr(); + /* + * minus 2 to align with normal testcases. + * For subtest we print additional '.x' in number. + * for example: + * + * 35: Test LLVM searching and compiling : + * 35.1: Basic BPF llvm compiling test : Ok + */ + int subw = width > 2 ? 
width - 2 : width; + bool skip = false; + int subi; + + if (subn <= 0) { + color_fprintf(stderr, PERF_COLOR_YELLOW, + " Skip (not compiled in)\n"); + continue; + } + pr_info("\n"); + + for (subi = 0; subi < subn; subi++) { + int len = strlen(t->subtest.get_desc(subi)); + + if (subw < len) + subw = len; + } + + for (subi = 0; subi < subn; subi++) { + pr_info("%2d.%1d: %-*s:", i, subi + 1, subw, + t->subtest.get_desc(subi)); + err = test_and_print(t, skip, subi); + if (err != TEST_OK && t->subtest.skip_if_fail) + skip = true; + } } } diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index 4350c45..06f45c1 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -46,7 +46,7 @@ static struct { }, [LLVM_TESTCASE_BPF_PROLOGUE] = { .source = test_llvm__bpf_test_prologue_prog, - .desc = "Test BPF prologue generation", + .desc = "Compile source for BPF prologue generation test", }, }; @@ -131,44 +131,39 @@ out: return ret; } -int test__llvm(int subtest __maybe_unused) +int test__llvm(int subtest) { - enum test_llvm__testcase i; + int ret; + void *obj_buf = NULL; + size_t obj_buf_sz = 0; - for (i = 0; i < __LLVM_TESTCASE_MAX; i++) { - int ret; - void *obj_buf = NULL; - size_t obj_buf_sz = 0; + if ((subtest < 0) || (subtest >= __LLVM_TESTCASE_MAX)) + return TEST_FAIL; - ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz, - i, false); + ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz, + subtest, false); - if (ret == TEST_OK) { - ret = test__bpf_parsing(obj_buf, obj_buf_sz); - if (ret != TEST_OK) - pr_debug("Failed to parse test case '%s'\n", - bpf_source_table[i].desc); - } - free(obj_buf); - - switch (ret) { - case TEST_SKIP: - return TEST_SKIP; - case TEST_OK: - break; - default: - /* - * Test 0 is the basic LLVM test. If test 0 - * fail, the basic LLVM support not functional - * so the whole test should fail. If other test - * case fail, it can be fixed by adjusting - * config so don't report error. 
- */ - if (i == 0) - return TEST_FAIL; - else - return TEST_SKIP; + if (ret == TEST_OK) { + ret = test__bpf_parsing(obj_buf, obj_buf_sz); + if (ret != TEST_OK) { + pr_debug("Failed to parse test case '%s'\n", + bpf_source_table[subtest].desc); } } - return TEST_OK; + free(obj_buf); + + return ret; +} + +int test__llvm_subtest_get_nr(void) +{ + return __LLVM_TESTCASE_MAX; +} + +const char *test__llvm_subtest_get_desc(int subtest) +{ + if ((subtest < 0) || (subtest >= __LLVM_TESTCASE_MAX)) + return NULL; + + return bpf_source_table[subtest].desc; } diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 204e4ee..f92af52 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -1,6 +1,8 @@ #ifndef TESTS_H #define TESTS_H +#include + #define TEST_ASSERT_VAL(text, cond) \ do { \ if (!(cond)) { \ @@ -27,6 +29,11 @@ enum { struct test { const char *desc; int (*func)(int subtest); + struct { + bool skip_if_fail; + int (*get_nr)(void); + const char *(*get_desc)(int subtest); + } subtest; }; /* Tests */ @@ -66,6 +73,8 @@ int test__fdarray__add(int subtest); int test__kmod_path__parse(int subtest); int test__thread_map(int subtest); int test__llvm(int subtest); +const char *test__llvm_subtest_get_desc(int subtest); +int test__llvm_subtest_get_nr(void); int test__bpf(int subtest); int test_session_topology(int subtest); -- cgit v1.1 From 77a0cf682f7979554e10a6c605a1fef4f4197654 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 17 Nov 2015 08:32:49 +0000 Subject: perf test: Print result for each BPF subtest This patch prints each sub-tests results for BPF testcases. Before: # ./perf test BPF 37: Test BPF filter : Ok After: # ./perf test BPF 37: Test BPF filter : 37.1: Test basic BPF filtering : Ok 37.2: Test BPF prologue generation : Ok When a failure happens: # cat ~/.perfconfig [llvm] clang-path = "/bin/false" # ./perf test BPF 37: Test BPF filter : 37.1: Test basic BPF filtering : Skip 37.2: Test BPF prologue generation : Skip Suggested-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1447749170-175898-5-git-send-email-wangnan0@huawei.com [ Fixed up not to use .func in an anonymous union ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/bpf.c | 38 ++++++++++++++++++++++++++++---------- tools/perf/tests/builtin-test.c | 5 +++++ tools/perf/tests/tests.h | 2 ++ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 4efdc16..33689a0 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -215,28 +215,46 @@ out: return ret; } -int test__bpf(int subtest __maybe_unused) +int test__bpf_subtest_get_nr(void) +{ + return (int)ARRAY_SIZE(bpf_testcase_table); +} + +const char *test__bpf_subtest_get_desc(int i) +{ + if (i < 0 || i >= (int)ARRAY_SIZE(bpf_testcase_table)) + return NULL; + return bpf_testcase_table[i].desc; +} + +int test__bpf(int i) { - unsigned int i; int err; + if (i < 0 || i >= (int)ARRAY_SIZE(bpf_testcase_table)) + return TEST_FAIL; + if (geteuid() != 0) { pr_debug("Only root can run BPF test\n"); return TEST_SKIP; } - for (i = 0; i < ARRAY_SIZE(bpf_testcase_table); i++) { - err = __test__bpf(i); + err = __test__bpf(i); + return err; +} - if (err != TEST_OK) - return err; - } +#else +int test__bpf_subtest_get_nr(void) +{ + return 0; +} - return TEST_OK; +const char *test__bpf_subtest_get_desc(int i __maybe_unused) +{ + return NULL; } -#else -int 
test__bpf(void)
+int test__bpf(int i __maybe_unused)
 { pr_debug("Skip BPF test because BPF support is not compiled\n"); return TEST_SKIP;
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 8136609..146ae98 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -173,6 +173,11 @@ static struct test generic_tests[] = { { .desc = "Test BPF filter", .func = test__bpf,
+ .subtest = {
+ .skip_if_fail = true,
+ .get_nr = test__bpf_subtest_get_nr,
+ .get_desc = test__bpf_subtest_get_desc,
+ },
 }, { .func = NULL,
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index f92af52..a0733aa 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -76,6 +76,8 @@ int test__llvm(int subtest); const char *test__llvm_subtest_get_desc(int subtest); int test__llvm_subtest_get_nr(void); int test__bpf(int subtest);
+const char *test__bpf_subtest_get_desc(int subtest);
+int test__bpf_subtest_get_nr(void);
 int test_session_topology(int subtest);
 #if defined(__arm__) || defined(__aarch64__)
-- cgit v1.1

From 5bcf2fe05318deb6fec209b4028d8a31f9f47221 Mon Sep 17 00:00:00 2001
From: Wang Nan
Date: Tue, 17 Nov 2015 08:32:50 +0000
Subject: perf test: Mute test cases error messages if verbose == 0

Sometimes error messages from test cases break the pretty output of 'perf test'. For example:

 # mv /lib/modules/4.3.0-rc4+/build/vmlinux{,.bak}
 # perf test LLVM BPF
 35: Test LLVM searching and compiling :
 35.1: Basic BPF llvm compiling test : Ok
 35.2: Test kbuild searching : Ok
 35.3: Compile source for BPF prologue generation test : Ok
 37: Test BPF filter :
 37.1: Test basic BPF filtering : Ok
 37.2: Test BPF prologue generation :Failed to find the path for kernel: No such file or directory
 FAILED!

This patch mutes test cases thoroughly by redirecting their stdout and stderr to /dev/null when verbose == 0. After applying this patch:

 # ./perf test LLVM BPF
 35: Test LLVM searching and compiling :
 35.1: Basic BPF llvm compiling test : Ok
 35.2: Test kbuild searching : Ok
 35.3: Compile source for BPF prologue generation test : Ok
 37: Test BPF filter :
 37.1: Test basic BPF filtering : Ok
 37.2: Test BPF prologue generation : FAILED!

 # ./perf test -v LLVM BPF
 35: Test LLVM searching and compiling :
 35.1: Basic BPF llvm compiling test :
 --- start ---
 test child forked, pid 13183
 Kernel build dir is set to /lib/modules/4.3.0-rc4+/build
 set env: KBUILD_DIR=/lib/modules/4.3.0-rc4+/build
 ...
 bpf: config 'func=null_lseek file->f_mode offset orig' is ok
 Looking at the vmlinux_path (7 entries long)
 Failed to find the path for kernel: No such file or directory
 bpf_probe: failed to convert perf probe eventsFailed to add events selected by BPF
 test child finished with -1
 ---- end ----
 Test BPF filter subtest 1: FAILED!
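[Aside, not part of the commit: the redirection sequence in the diff below is easy to misread, so here is the idiom in isolation. The helper name is made up for illustration.]

----
#include <fcntl.h>
#include <unistd.h>

/* Send a forked child's stdout and stderr to /dev/null. */
static void mute_child_output(void)
{
	int nullfd = open("/dev/null", O_WRONLY);

	if (nullfd >= 0) {
		dup2(nullfd, STDOUT_FILENO);        /* stdout -> /dev/null */
		dup2(STDOUT_FILENO, STDERR_FILENO); /* stderr -> same target */
		close(nullfd);                      /* the dups keep it open */
	}
}
----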
Signed-off-by: Wang Nan
Tested-by: Arnaldo Carvalho de Melo
Cc: Alexei Starovoitov
Cc: Masami Hiramatsu
Cc: Zefan Li
Cc: pi3orama@163.com
Link: http://lkml.kernel.org/r/1447749170-175898-6-git-send-email-wangnan0@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/tests/builtin-test.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 146ae98..2b1ade1 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -226,6 +226,18 @@ static int run_test(struct test *test, int subtest) if (!child) { pr_debug("test child forked, pid %d\n", getpid());
+ if (!verbose) {
+ int nullfd = open("/dev/null", O_WRONLY);
+ if (nullfd >= 0) {
+ close(STDERR_FILENO);
+ close(STDOUT_FILENO);
+
+ dup2(nullfd, STDOUT_FILENO);
+ dup2(STDOUT_FILENO, STDERR_FILENO);
+ close(nullfd);
+ }
+ }
+
 err = test->func(subtest); exit(err); }
-- cgit v1.1

From 05c8d802fa52ef17dbcce21c38b72b4a313eb036 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:12 +0900
Subject: perf probe: Fix to free temporary Dwarf_Frame

Since dwarf_cfi_addrframe() returns a malloc'd Dwarf_Frame object, it has to be freed after use.

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064011.30709.65674.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/probe-finder.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 05012bb..1cab05a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -683,21 +683,24 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf) ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1); if (ret <= 0 || nops == 0) { pf->fb_ops = NULL;
+ ret = 0;
 #if _ELFUTILS_PREREQ(0, 142)
 } else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa && pf->cfi != NULL) {
- Dwarf_Frame *frame;
+ Dwarf_Frame *frame = NULL;
 if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 || dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) { pr_warning("Failed to get call frame on 0x%jx\n", (uintmax_t)pf->addr);
- return -ENOENT;
+ ret = -ENOENT;
 }
+ free(frame);
 #endif
 }
 /* Call finder's callback handler */
- ret = pf->callback(sc_die, pf);
+ if (ret >= 0)
+ ret = pf->callback(sc_die, pf);
 /* *pf->fb_ops will be cached in libdw. Don't free it. */ pf->fb_ops = NULL;
-- cgit v1.1

From 9afcb420d6cfeadf5d872f395061c611536615fb Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:20 +0900
Subject: perf machine: Fix machine__findnew_module_map to put registered map

Fix the machine object to drop its reference to the map object after inserting it into machine->kmaps.
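[Aside before the trace below: the get/put contract that this and the following fixes enforce can be summarized in a few lines. This is an illustrative sketch, not code from any of the patches.]

----
/* Every constructor and every container insertion owns one reference. */
struct map *map = map__new2(start, dso, type); /* refcnt = 1 (constructor) */

map_groups__insert(kmaps, map);                /* refcnt = 2 (kmaps holds one) */
map__put(map);                                 /* drop the constructor ref -> 1 */

/* ... later, map_groups__exit(kmaps) drops the last ref and frees the map. */
----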
The refcnt debugger shows what happened:

----
==== [2] ====
Unreclaimed map: 0x346f750
Refcount +1 => 1 at
	./perf(map__new2+0xb5) [0x4bdea5]
	./perf() [0x4b8aaf]
	./perf(modules__parse+0xfc) [0x4a9cbc]
	./perf() [0x4b83c0]
	./perf(machine__create_kernel_maps+0x148) [0x4bb208]
	./perf(machine__new_host+0xfa) [0x4bb3fa]
	./perf(init_probe_symbol_maps+0x93) [0x5062b3]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f5373899af5]
	./perf() [0x4220a9]
Refcount +1 => 2 at
	./perf(maps__insert+0x9a) [0x4bfd4a]
	./perf() [0x4b8acb]
	./perf(modules__parse+0xfc) [0x4a9cbc]
	./perf() [0x4b83c0]
	./perf(machine__create_kernel_maps+0x148) [0x4bb208]
	./perf(machine__new_host+0xfa) [0x4bb3fa]
	./perf(init_probe_symbol_maps+0x93) [0x5062b3]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f5373899af5]
	./perf() [0x4220a9]
Refcount -1 => 1 at
	./perf(map_groups__exit+0x94) [0x4bea54]
	./perf(machine__delete+0x3d) [0x4b91ed]
	./perf(exit_probe_symbol_maps+0x28) [0x506358]
	./perf() [0x45628a]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f5373899af5]
	./perf() [0x4220a9]
----

This pattern clearly shows that the refcnt of the map is acquired twice, by map__new2 and maps__insert, but released only once, at map_groups__exit, when we purge its maps rbtree. Since maps__insert already reference-counted the map, we have to drop the constructor (map__new2) reference count right after inserting it. This happened in machine__findnew_module_map, as below.

----
# eu-addr2line -e ./perf -f 0x4b8aaf
machine__findnew_module_map inlined at util/machine.c:1046 in machine__create_module
util/machine.c:582
# eu-addr2line -e ./perf -f 0x4b8acb
map_groups__insert inlined at util/machine.c:585 in machine__create_module
util/map.h:208
----

(note that both are at util/machine.c:58X, which is machine__findnew_module_map)

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064020.30709.40499.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/machine.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 8b303ff..0487d77 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -585,6 +585,8 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start, map_groups__insert(&machine->kmaps, map);
+ /* Put the map here because map_groups__insert alread got it */
+ map__put(map);
 out: free(m.name); return map; }
-- cgit v1.1

From e96e4078e9a5ea150b3ad9a296440a7976439e4a Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:22 +0900
Subject: perf machine: Fix machine__destroy_kernel_maps to drop vmlinux_maps references

Fix machine__destroy_kernel_maps() to drop the vmlinux_maps references before filling them with NULL.
The refcnt debugger shows:

==== [1] ====
Unreclaimed map: 0x36b1070
Refcount +1 => 1 at
	./perf(map__new2+0xb5) [0x4bdec5]
	./perf(machine__create_kernel_maps+0x72) [0x4bb152]
	./perf(machine__new_host+0xfa) [0x4bb41a]
	./perf(init_probe_symbol_maps+0x93) [0x5062d3]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f1fc9fc4af5]
	./perf() [0x4220a9]
Refcount +1 => 2 at
	./perf(maps__insert+0x9a) [0x4bfd6a]
	./perf(machine__create_kernel_maps+0xc3) [0x4bb1a3]
	./perf(machine__new_host+0xfa) [0x4bb41a]
	./perf(init_probe_symbol_maps+0x93) [0x5062d3]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f1fc9fc4af5]
	./perf() [0x4220a9]
Refcount -1 => 1 at
	./perf(map_groups__exit+0x94) [0x4bea74]
	./perf(machine__delete+0x3d) [0x4b91fd]
	./perf(exit_probe_symbol_maps+0x28) [0x506378]
	./perf() [0x45628a]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f1fc9fc4af5]
	./perf() [0x4220a9]

map__new2() returns a map with refcnt = 1, and map_groups__insert gets it again in __machine__create_kernel_maps(). machine__destroy_kernel_maps() calls map_groups__remove() to decrement the refcnt, but before decrementing it again (the reference corresponding to map__new2), it sets vmlinux_maps[type] = NULL, which may cause a refcnt leak.

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064022.30709.3897.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/machine.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 0487d77..e9e09be 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -790,6 +790,7 @@ void machine__destroy_kernel_maps(struct machine *machine) kmap->ref_reloc_sym = NULL; }
+ map__put(machine->vmlinux_maps[type]);
 machine->vmlinux_maps[type] = NULL; } }
-- cgit v1.1

From ebe9729c8c3171aa46ad5d7af40acdc29806689d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:24 +0900
Subject: perf machine: Fix to destroy kernel maps when machine exits

machine__exit() forgot to call machine__destroy_kernel_maps(). This fixes some map memory leaks, as shown below.

Without this fix:

----
./perf probe vfs_read
Added new event:
 probe:vfs_read (on vfs_read)

You can now use it in all perf tools, such as:

 perf record -e probe:vfs_read -aR sleep 1

REFCNT: BUG: Unreclaimed objects found.
REFCNT: Total 4 objects are not reclaimed.
To see all backtraces, rerun with -v option
----

With this fix:

----
./perf probe vfs_read
Added new event:
 probe:vfs_read (on vfs_read)

You can now use it in all perf tools, such as:

 perf record -e probe:vfs_read -aR sleep 1

REFCNT: BUG: Unreclaimed objects found.
REFCNT: Total 2 objects are not reclaimed.
To see all backtraces, rerun with -v option
----

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064024.30709.43577.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/machine.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index e9e09be..a358771 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -122,6 +122,7 @@ void machine__delete_threads(struct machine *machine) void machine__exit(struct machine *machine) {
+ machine__destroy_kernel_maps(machine);
 map_groups__exit(&machine->kmaps); dsos__exit(&machine->dsos); machine__exit_vdso(machine);
-- cgit v1.1

From c4068f51d40df151a661a384ab1309b11d7f012e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 19 Nov 2015 15:04:53 +0900
Subject: perf tools: Make perf_exec_path() always return malloc'd string

Since system_path() returns a malloc'd string when the given path is not absolute, perf_exec_path() sometimes returns a static string and sometimes returns a malloc'd string, depending on the environment variables or command options. This may cause a memory leak because the caller cannot unconditionally free the returned string.

This fixes perf_exec_path() and system_path() to always return a malloc'd string, so the caller can always free it.

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151119060453.14210.65666.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/exec_cmd.c | 21 +++++++++++----------
 tools/perf/util/exec_cmd.h | 5 +++--
 tools/perf/util/help.c | 6 ++++--
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
index 7adf4ad..1099e92 100644
--- a/tools/perf/util/exec_cmd.c
+++ b/tools/perf/util/exec_cmd.c
@@ -9,17 +9,17 @@ static const char *argv_exec_path; static const char *argv0_path;
-const char *system_path(const char *path)
+char *system_path(const char *path)
 { static const char *prefix = PREFIX; struct strbuf d = STRBUF_INIT;
 if (is_absolute_path(path))
- return path;
+ return strdup(path);
 strbuf_addf(&d, "%s/%s", prefix, path); path = strbuf_detach(&d, NULL);
- return path;
+ return (char *)path;
 }
 const char *perf_extract_argv0_path(const char *argv0)
@@ -52,17 +52,16 @@ void perf_set_argv_exec_path(const char *exec_path)
 /* Returns the highest-priority, location to look for perf programs.
 */
-const char *perf_exec_path(void)
+char *perf_exec_path(void)
 {
- const char *env;
+ char *env;
 if (argv_exec_path)
- return argv_exec_path;
+ return strdup(argv_exec_path);
 env = getenv(EXEC_PATH_ENVIRONMENT);
- if (env && *env) {
- return env;
- }
+ if (env && *env)
+ return strdup(env);
 return system_path(PERF_EXEC_PATH);
 }
@@ -83,9 +82,11 @@ void setup_path(void) { const char *old_path = getenv("PATH"); struct strbuf new_path = STRBUF_INIT;
+ char *tmp = perf_exec_path();
- add_path(&new_path, perf_exec_path());
+ add_path(&new_path, tmp);
 add_path(&new_path, argv0_path);
+ free(tmp);
 if (old_path) strbuf_addstr(&new_path, old_path);
diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h
index bc4b915..48b4175 100644
--- a/tools/perf/util/exec_cmd.h
+++ b/tools/perf/util/exec_cmd.h
@@ -3,10 +3,11 @@ extern void perf_set_argv_exec_path(const char *exec_path); extern const char *perf_extract_argv0_path(const char *path);
-extern const char *perf_exec_path(void);
 extern void setup_path(void);
 extern int execv_perf_cmd(const char **argv); /* NULL terminated */
 extern int execl_perf_cmd(const char *cmd, ...);
-extern const char *system_path(const char *path);
+/* perf_exec_path and system_path return malloc'd string, caller must free it */
+extern char *perf_exec_path(void);
+extern char *system_path(const char *path);
 #endif /* __PERF_EXEC_CMD_H */
diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c
index 86c37c4..fa1fc4a 100644
--- a/tools/perf/util/help.c
+++ b/tools/perf/util/help.c
@@ -159,7 +159,7 @@ void load_command_list(const char *prefix, struct cmdnames *other_cmds) { const char *env_path = getenv("PATH");
- const char *exec_path = perf_exec_path();
+ char *exec_path = perf_exec_path();
 if (exec_path) { list_commands_in_dir(main_cmds, exec_path, prefix);
@@ -187,6 +187,7 @@ void load_command_list(const char *prefix, sizeof(*other_cmds->names), cmdname_compare); uniq(other_cmds); }
+ free(exec_path);
 exclude_cmds(other_cmds, main_cmds);
 }
@@ -203,13 +204,14 @@ void list_commands(const char *title, struct cmdnames *main_cmds, longest = other_cmds->names[i]->len;
 if (main_cmds->cnt) {
- const char *exec_path = perf_exec_path();
+ char *exec_path = perf_exec_path();
 printf("available %s in '%s'\n", title, exec_path); printf("----------------"); mput_char('-', strlen(title) + strlen(exec_path)); putchar('\n'); pretty_print_string_list(main_cmds, longest); putchar('\n');
+ free(exec_path);
 }
 if (other_cmds->cnt) {
-- cgit v1.1

From 8d5c340dfcd48751fdff301bb2a7e3f875652dcb Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:27 +0900
Subject: perf tools: Fix to put new map after inserting to map_groups in dso__load_sym

Fix dso__load_sym to put the map object which has already been inserted into kmaps.
The refcnt debugger shows:

==== [0] ====
Unreclaimed map: 0x39113e0
Refcount +1 => 1 at
	./perf(map__new2+0xb5) [0x4be155]
	./perf(dso__load_sym+0xee1) [0x503461]
	./perf(dso__load_vmlinux+0xbf) [0x4aa6df]
	./perf(dso__load_vmlinux_path+0x8c) [0x4aa83c]
	./perf() [0x50528a]
	./perf(convert_perf_probe_events+0xd79) [0x50ac29]
	./perf() [0x45600f]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f152368baf5]
	./perf() [0x4220a9]
Refcount +1 => 2 at
	./perf(maps__insert+0x9a) [0x4bfffa]
	./perf(dso__load_sym+0xf89) [0x503509]
	./perf(dso__load_vmlinux+0xbf) [0x4aa6df]
	./perf(dso__load_vmlinux_path+0x8c) [0x4aa83c]
	./perf() [0x50528a]
	./perf(convert_perf_probe_events+0xd79) [0x50ac29]
	./perf() [0x45600f]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f152368baf5]
	./perf() [0x4220a9]
Refcount -1 => 1 at
	./perf(map_groups__exit+0x94) [0x4bed04]
	./perf(machine__delete+0xb0) [0x4b9300]
	./perf(exit_probe_symbol_maps+0x28) [0x506608]
	./perf() [0x45628a]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f152368baf5]
	./perf() [0x4220a9]

This means that dso__load_sym calls map__new2 and maps__insert, both of which bump the map refcount, but map_groups__exit will drop just one reference. Fix it by dropping the refcount right after inserting the map into kmaps.

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064026.30709.50038.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/symbol-elf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 475d88d..53f1996 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1042,6 +1042,8 @@ int dso__load_sym(struct dso *dso, struct map *map, } curr_dso->symtab_type = dso->symtab_type; map_groups__insert(kmaps, curr_map);
+ /* kmaps already got it */
+ map__put(curr_map);
 dsos__add(&map->groups->machine->dsos, curr_dso); dso__set_loaded(curr_dso, map->type); } else
-- cgit v1.1

From 82de26abdc127172fd7453a61d35a9b33bf4f871 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:31 +0900
Subject: perf tools: Fix __dsos__addnew to put dso after adding it to the list

__dsos__addnew should drop the constructor reference to the dso after adding it to the list, because __dsos__add() will get a reference that will be kept while it is in the list. This fixes DSO leaks when entries are removed from the list and the refcount never gets to zero.
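[Aside, mirroring the map sketch above for dso objects; this is an illustrative fragment, not code from the patch.]

----
struct dso *dso = dso__new(name); /* refcnt = 1 (constructor) */

__dsos__add(dsos, dso);           /* refcnt = 2 (the list owns one) */
dso__put(dso);                    /* drop the constructor ref -> 1 */

/* The remaining reference belongs to the list and is dropped on removal. */
----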
The refcnt debugger shows:

==== [0] ====
Unreclaimed dso: 0x2fccab0
Refcount +1 => 1 at
	./perf(dso__new+0x1ff) [0x4a62df]
	./perf(__dsos__addnew+0x29) [0x4a6e19]
	./perf(dsos__findnew+0xd1) [0x4a7281]
	./perf(machine__findnew_kernel+0x27) [0x4a5e17]
	./perf() [0x4b8df2]
	./perf(machine__create_kernel_maps+0x28) [0x4bb528]
	./perf(machine__new_host+0xfa) [0x4bb84a]
	./perf(init_probe_symbol_maps+0x93) [0x506713]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f46df132af5]
	./perf() [0x4220a9]
Refcount +1 => 2 at
	./perf(__dsos__addnew+0xfb) [0x4a6eeb]
	./perf(dsos__findnew+0xd1) [0x4a7281]
	./perf(machine__findnew_kernel+0x27) [0x4a5e17]
	./perf() [0x4b8df2]
	./perf(machine__create_kernel_maps+0x28) [0x4bb528]
	./perf(machine__new_host+0xfa) [0x4bb84a]
	./perf(init_probe_symbol_maps+0x93) [0x506713]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f46df132af5]
	./perf() [0x4220a9]
Refcount +1 => 3 at
	./perf(dsos__findnew+0x7e) [0x4a722e]
	./perf(machine__findnew_kernel+0x27) [0x4a5e17]
	./perf() [0x4b8df2]
	./perf(machine__create_kernel_maps+0x28) [0x4bb528]
	./perf(machine__new_host+0xfa) [0x4bb84a]
	./perf(init_probe_symbol_maps+0x93) [0x506713]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f46df132af5]
	./perf() [0x4220a9]
[snip]

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064031.30709.81460.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/dso.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 425df5c..e8e9a9d 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1243,6 +1243,8 @@ struct dso *__dsos__addnew(struct dsos *dsos, const char *name) if (dso != NULL) { __dsos__add(dsos, dso); dso__set_basename(dso);
+ /* Put dso here because __dsos_add already got it */
+ dso__put(dso);
 } return dso; }
-- cgit v1.1

From 1154c957607afdf5936ae14e1be27d7ca4e7bd30 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:33 +0900
Subject: perf tools: Fix machine__create_kernel_maps to put kernel dso refcount

Fix machine__create_kernel_maps() to put the kernel dso, because the dso was obtained via __machine__create_kernel_maps().
The refcnt debugger shows:

==== [0] ====
Unreclaimed dso: 0x3036ab0
Refcount +1 => 1 at
	./perf(dso__new+0x1ff) [0x4a62df]
	./perf(__dsos__addnew+0x29) [0x4a6e19]
	./perf(dsos__findnew+0xd1) [0x4a7181]
	./perf(machine__findnew_kernel+0x27) [0x4a5e17]
	./perf() [0x4b8cf2]
	./perf(machine__create_kernel_maps+0x28) [0x4bb428]
	./perf(machine__new_host+0xfa) [0x4bb74a]
	./perf(init_probe_symbol_maps+0x93) [0x506613]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7ffa6809eaf5]
	./perf() [0x4220a9]
[snip]
Refcount +1 => 2 at
	./perf(dsos__findnew+0x7e) [0x4a712e]
	./perf(machine__findnew_kernel+0x27) [0x4a5e17]
	./perf() [0x4b8cf2]
	./perf(machine__create_kernel_maps+0x28) [0x4bb428]
	./perf(machine__new_host+0xfa) [0x4bb74a]
	./perf(init_probe_symbol_maps+0x93) [0x506613]
	./perf() [0x455ffa]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7ffa6809eaf5]
	./perf() [0x4220a9]
[snip]
Refcount -1 => 1 at
	./perf(dso__put+0x2f) [0x4a664f]
	./perf(machine__delete+0xfe) [0x4b93ee]
	./perf(exit_probe_symbol_maps+0x28) [0x5066b8]
	./perf() [0x45628a]
	./perf(cmd_probe+0x6c) [0x4566bc]
	./perf() [0x47abc5]
	./perf(main+0x610) [0x421f90]
	/lib64/libc.so.6(__libc_start_main+0xf5) [0x7ffa6809eaf5]
	./perf() [0x4220a9]

Actually, dsos__findnew gets the dso before returning it, so the dso user (in this case machine__create_kernel_maps) has to put the dso after use.

Signed-off-by: Masami Hiramatsu
Cc: Adrian Hunter
Cc: Jiri Olsa
Cc: Namhyung Kim
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/20151118064033.30709.98954.stgit@localhost.localdomain
Signed-off-by: Arnaldo Carvalho de Melo
---
 tools/perf/util/machine.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index a358771..0b4a05c 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1088,11 +1088,14 @@ int machine__create_kernel_maps(struct machine *machine) struct dso *kernel = machine__get_kernel(machine); const char *name; u64 addr = machine__get_running_kernel_start(machine, &name);
- if (!addr)
+ int ret;
+
+ if (!addr || kernel == NULL)
 return -1;
- if (kernel == NULL ||
- __machine__create_kernel_maps(machine, kernel) < 0)
+ ret = __machine__create_kernel_maps(machine, kernel);
+ dso__put(kernel);
+ if (ret < 0)
 return -1;
 if (symbol_conf.use_modules && machine__create_modules(machine) < 0) {
-- cgit v1.1

From 566c69c36e6178774dd484ea4a02b76f6bd0ede4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Wed, 18 Nov 2015 15:40:35 +0900
Subject: perf machine: Fix machine__findnew_module_map to put dso

Fix machine__findnew_module_map to drop the reference to the dso, because it is already referenced by both machine__findnew_module_dso() and map__new2().
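[Aside on the pattern used in the fix below: the dso local is initialized to NULL and put unconditionally on the way out, which relies on dso__put() accepting NULL. Illustrative fragment only.]

----
struct dso *dso = NULL;
...
	dso = machine__findnew_module_dso(machine, &m, filename); /* +1 ref */
	...
out:
	dso__put(dso);	/* safe on the early-error paths where dso is still NULL */
	free(m.name);
----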
Refcnt debugger shows: ==== [1] ==== Unreclaimed dso: 0x1ffd980 Refcount +1 => 1 at ./perf(dso__new+0x1ff) [0x4a62df] ./perf(__dsos__addnew+0x29) [0x4a6e19] ./perf() [0x4b8b91] ./perf(modules__parse+0xfc) [0x4a9d5c] ./perf() [0x4b8460] ./perf(machine__create_kernel_maps+0x150) [0x4bb550] ./perf(machine__new_host+0xfa) [0x4bb75a] ./perf(init_probe_symbol_maps+0x93) [0x506623] ./perf() [0x455ffa] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f1345a8eaf5] ./perf() [0x4220a9] This map_groups__insert(0x4b8b91) already gets a reference to the new dso: ---- eu-addr2line -e ./perf -f 0x4b8b91 map_groups__insert inlined at util/machine.c:586 in machine__create_module util/map.h:207 ---- So this dso refcnt will be released when map_groups gets released. [snip] Refcount +1 => 2 at ./perf(dso__get+0x34) [0x4a65f4] ./perf() [0x4b8b35] ./perf(modules__parse+0xfc) [0x4a9d5c] ./perf() [0x4b8460] ./perf(machine__create_kernel_maps+0x150) [0x4bb550] ./perf(machine__new_host+0xfa) [0x4bb75a] ./perf(init_probe_symbol_maps+0x93) [0x506623] ./perf() [0x455ffa] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f1345a8eaf5] ./perf() [0x4220a9] Here, machine__findnew_module_dso(0x4b8b35) gets the dso (and stores it in a local variable): ---- # eu-addr2line -e ./perf -f 0x4b8b35 machine__findnew_module_dso inlined at util/machine.c:578 in machine__create_module util/machine.c:514 ---- Refcount +1 => 3 at ./perf(dso__get+0x34) [0x4a65f4] ./perf(map__new2+0x76) [0x4be1c6] ./perf() [0x4b8b4f] ./perf(modules__parse+0xfc) [0x4a9d5c] ./perf() [0x4b8460] ./perf(machine__create_kernel_maps+0x150) [0x4bb550] ./perf(machine__new_host+0xfa) [0x4bb75a] ./perf(init_probe_symbol_maps+0x93) [0x506623] ./perf() [0x455ffa] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f1345a8eaf5] ./perf() [0x4220a9] But also map__new2() gets the dso which will be put when the map is released. So, we have to drop the constructor reference obtained in machine__findnew_module_dso(). Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151118064035.30709.58824.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 0b4a05c..7f5071a 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -565,7 +565,7 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start, const char *filename) { struct map *map = NULL; - struct dso *dso; + struct dso *dso = NULL; struct kmod_path m; if (kmod_path__parse_name(&m, filename)) @@ -589,6 +589,8 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start, /* Put the map here because map_groups__insert alread got it */ map__put(map); out: + /* put the dso here, corresponding to machine__findnew_module_dso */ + dso__put(dso); free(m.name); return map; } -- cgit v1.1 From 26e779245dd6f5270c0696860438e5c03d0780fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:37 +0900 Subject: perf report: Support folded callchain mode on --stdio Add new call chain option (-g) 'folded' to print callchains in a line. 
The callchains are separated by semicolons, and preceded by (absolute) percent values and a space. For example, the following 20 lines can be printed in 3 lines with the folded output mode: $ perf report -g flat --no-children | grep -v ^# | head -20 60.48% swapper [kernel.vmlinux] [k] intel_idle 54.60% intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry start_secondary 5.88% intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry rest_init start_kernel x86_64_start_reservations x86_64_start_kernel $ perf report -g folded --no-children | grep -v ^# | head -3 60.48% swapper [kernel.vmlinux] [k] intel_idle 54.60% intel_idle;cpuidle_enter_state;cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary 5.88% intel_idle;cpuidle_enter_state;cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel This mode is supported only for --stdio now and intended to be used by some scripts like in FlameGraphs[1]. Support for other UI might be added later. [1] http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html Requested-and-Tested-by: Brendan Gregg Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 1 + tools/perf/ui/stdio/hist.c | 55 ++++++++++++++++++++++++++++++++ tools/perf/util/callchain.c | 6 ++++ tools/perf/util/callchain.h | 5 +-- 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 5ce8da1..f7d81aa 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -181,6 +181,7 @@ OPTIONS - graph: use a graph tree, displaying absolute overhead rates. (default) - fractal: like graph, but displays relative rates. Each branch of the tree is considered as a new profiled object. + - folded: call chains are displayed in a line, separated by semicolons - none: disable call chain display. threshold is a percentage value which specifies a minimum percent to be diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index dfcbc90..ea798493 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -260,6 +260,58 @@ static size_t callchain__fprintf_flat(FILE *fp, struct rb_root *tree, return ret; } +static size_t __callchain__fprintf_folded(FILE *fp, struct callchain_node *node) +{ + const char *sep = symbol_conf.field_sep ?: ";"; + struct callchain_list *chain; + size_t ret = 0; + char bf[1024]; + bool first; + + if (!node) + return 0; + + ret += __callchain__fprintf_folded(fp, node->parent); + + first = (ret == 0); + list_for_each_entry(chain, &node->val, list) { + if (chain->ip >= PERF_CONTEXT_MAX) + continue; + ret += fprintf(fp, "%s%s", first ? 
"" : sep, + callchain_list__sym_name(chain, + bf, sizeof(bf), false)); + first = false; + } + + return ret; +} + +static size_t callchain__fprintf_folded(FILE *fp, struct rb_root *tree, + u64 total_samples) +{ + size_t ret = 0; + u32 entries_printed = 0; + struct callchain_node *chain; + struct rb_node *rb_node = rb_first(tree); + + while (rb_node) { + double percent; + + chain = rb_entry(rb_node, struct callchain_node, rb_node); + percent = chain->hit * 100.0 / total_samples; + + ret += fprintf(fp, "%.2f%% ", percent); + ret += __callchain__fprintf_folded(fp, chain); + ret += fprintf(fp, "\n"); + if (++entries_printed == callchain_param.print_limit) + break; + + rb_node = rb_next(rb_node); + } + + return ret; +} + static size_t hist_entry_callchain__fprintf(struct hist_entry *he, u64 total_samples, int left_margin, FILE *fp) @@ -278,6 +330,9 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he, case CHAIN_FLAT: return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples); break; + case CHAIN_FOLDED: + return callchain__fprintf_folded(fp, &he->sorted_chain, total_samples); + break; case CHAIN_NONE: break; default: diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 735ad48..08cb220 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -44,6 +44,10 @@ static int parse_callchain_mode(const char *value) callchain_param.mode = CHAIN_GRAPH_REL; return 0; } + if (!strncmp(value, "folded", strlen(value))) { + callchain_param.mode = CHAIN_FOLDED; + return 0; + } return -1; } @@ -218,6 +222,7 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, switch (mode) { case CHAIN_FLAT: + case CHAIN_FOLDED: if (rnode->hit < chain->hit) p = &(*p)->rb_left; else @@ -338,6 +343,7 @@ int callchain_register_param(struct callchain_param *param) param->sort = sort_chain_graph_rel; break; case CHAIN_FLAT: + case CHAIN_FOLDED: param->sort = sort_chain_flat; break; case CHAIN_NONE: diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index fce8161..544d99a 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -24,7 +24,7 @@ #define CALLCHAIN_RECORD_HELP CALLCHAIN_HELP RECORD_MODE_HELP RECORD_SIZE_HELP #define CALLCHAIN_REPORT_HELP \ - HELP_PAD "print_type:\tcall graph printing style (graph|flat|fractal|none)\n" \ + HELP_PAD "print_type:\tcall graph printing style (graph|flat|fractal|folded|none)\n" \ HELP_PAD "threshold:\tminimum call graph inclusion threshold ()\n" \ HELP_PAD "print_limit:\tmaximum number of call graph entry ()\n" \ HELP_PAD "order:\t\tcall graph order (caller|callee)\n" \ @@ -43,7 +43,8 @@ enum chain_mode { CHAIN_NONE, CHAIN_FLAT, CHAIN_GRAPH_ABS, - CHAIN_GRAPH_REL + CHAIN_GRAPH_REL, + CHAIN_FOLDED, }; enum chain_order { -- cgit v1.1 From 5ab250cafcd884a2638b102239870bddca42ff88 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:39 +0900 Subject: perf callchain: Abstract callchain print function This is a preparation to support for printing other type of callchain value like count or period. 
Signed-off-by: Namhyung Kim Tested-by: Brendan Gregg Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-4-git-send-email-namhyung@kernel.org [ renamed new _sprintf_ operation to _scnprintf_ ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 8 +++++--- tools/perf/ui/gtk/hists.c | 8 ++------ tools/perf/ui/stdio/hist.c | 35 +++++++++++++++++------------------ tools/perf/util/callchain.c | 29 +++++++++++++++++++++++++++++ tools/perf/util/callchain.h | 4 ++++ 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index fa9eb92..0b18857 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -592,7 +592,6 @@ static int hist_browser__show_callchain(struct hist_browser *browser, while (node) { struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); struct rb_node *next = rb_next(node); - u64 cumul = callchain_cumul_hits(child); struct callchain_list *chain; char folded_sign = ' '; int first = true; @@ -619,9 +618,12 @@ static int hist_browser__show_callchain(struct hist_browser *browser, browser->show_dso); if (was_first && need_percent) { - double percent = cumul * 100.0 / total; + char buf[64]; - if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0) + callchain_node__scnprintf_value(child, buf, sizeof(buf), + total); + + if (asprintf(&alloc_str, "%s %s", buf, str) < 0) str = "Not enough memory!"; else str = alloc_str; diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 4b3585e..cff7bb9 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -100,14 +100,10 @@ static void perf_gtk__add_callchain(struct rb_root *root, GtkTreeStore *store, struct callchain_list *chain; GtkTreeIter iter, new_parent; bool need_new_parent; - double percent; - u64 hits, child_total; + u64 child_total; node = rb_entry(nd, struct callchain_node, rb_node); - hits = callchain_cumul_hits(node); - percent = 100.0 * hits / total; - new_parent = *parent; need_new_parent = !has_single_node && (node->val_nr > 1); @@ -116,7 +112,7 @@ static void perf_gtk__add_callchain(struct rb_root *root, GtkTreeStore *store, gtk_tree_store_append(store, &iter, &new_parent); - scnprintf(buf, sizeof(buf), "%5.2f%%", percent); + callchain_node__scnprintf_value(node, buf, sizeof(buf), total); gtk_tree_store_set(store, &iter, 0, buf, -1); callchain_list__sym_name(chain, buf, sizeof(buf), false); diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index ea798493..f4de055 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -34,10 +34,10 @@ static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask, return ret; } -static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, +static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node, + struct callchain_list *chain, int depth, int depth_mask, int period, - u64 total_samples, u64 hits, - int left_margin) + u64 total_samples, int left_margin) { int i; size_t ret = 0; @@ -50,10 +50,9 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, else ret += fprintf(fp, " "); if (!period && i == depth - 1) { - double percent; - - percent = hits * 100.0 / total_samples; - ret += percent_color_fprintf(fp, "--%2.2f%%-- ", percent); + ret += fprintf(fp, "--"); + ret += callchain_node__fprintf_value(node, fp, 
total_samples); + ret += fprintf(fp, "--"); } else ret += fprintf(fp, "%s", " "); } @@ -120,10 +119,9 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, left_margin); i = 0; list_for_each_entry(chain, &child->val, list) { - ret += ipchain__fprintf_graph(fp, chain, depth, + ret += ipchain__fprintf_graph(fp, child, chain, depth, new_depth_mask, i++, total_samples, - cumul, left_margin); } @@ -143,14 +141,17 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, if (callchain_param.mode == CHAIN_GRAPH_REL && remaining && remaining != total_samples) { + struct callchain_node rem_node = { + .hit = remaining, + }; if (!rem_sq_bracket) return ret; new_depth_mask &= ~(1 << (depth - 1)); - ret += ipchain__fprintf_graph(fp, &rem_hits, depth, + ret += ipchain__fprintf_graph(fp, &rem_node, &rem_hits, depth, new_depth_mask, 0, total_samples, - remaining, left_margin); + left_margin); } return ret; @@ -243,12 +244,11 @@ static size_t callchain__fprintf_flat(FILE *fp, struct rb_root *tree, struct rb_node *rb_node = rb_first(tree); while (rb_node) { - double percent; - chain = rb_entry(rb_node, struct callchain_node, rb_node); - percent = chain->hit * 100.0 / total_samples; - ret = percent_color_fprintf(fp, " %6.2f%%\n", percent); + ret += fprintf(fp, " "); + ret += callchain_node__fprintf_value(chain, fp, total_samples); + ret += fprintf(fp, "\n"); ret += __callchain__fprintf_flat(fp, chain, total_samples); ret += fprintf(fp, "\n"); if (++entries_printed == callchain_param.print_limit) @@ -295,12 +295,11 @@ static size_t callchain__fprintf_folded(FILE *fp, struct rb_root *tree, struct rb_node *rb_node = rb_first(tree); while (rb_node) { - double percent; chain = rb_entry(rb_node, struct callchain_node, rb_node); - percent = chain->hit * 100.0 / total_samples; - ret += fprintf(fp, "%.2f%% ", percent); + ret += callchain_node__fprintf_value(chain, fp, total_samples); + ret += fprintf(fp, " "); ret += __callchain__fprintf_folded(fp, chain); ret += fprintf(fp, "\n"); if (++entries_printed == callchain_param.print_limit) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 08cb220..b948bd0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -805,6 +805,35 @@ char *callchain_list__sym_name(struct callchain_list *cl, return bf; } +char *callchain_node__scnprintf_value(struct callchain_node *node, + char *bf, size_t bfsize, u64 total) +{ + double percent = 0.0; + u64 period = callchain_cumul_hits(node); + + if (callchain_param.mode == CHAIN_FOLDED) + period = node->hit; + if (total) + percent = period * 100.0 / total; + + scnprintf(bf, bfsize, "%.2f%%", percent); + return bf; +} + +int callchain_node__fprintf_value(struct callchain_node *node, + FILE *fp, u64 total) +{ + double percent = 0.0; + u64 period = callchain_cumul_hits(node); + + if (callchain_param.mode == CHAIN_FOLDED) + period = node->hit; + if (total) + percent = period * 100.0 / total; + + return percent_color_fprintf(fp, "%.2f%%", percent); +} + static void free_callchain_node(struct callchain_node *node) { struct callchain_list *list, *tmp; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 544d99a..060e636 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -230,6 +230,10 @@ static inline int arch_skip_callchain_idx(struct thread *thread __maybe_unused, char *callchain_list__sym_name(struct callchain_list *cl, char *bf, size_t bfsize, bool show_dso); +char *callchain_node__scnprintf_value(struct 
callchain_node *node, + char *bf, size_t bfsize, u64 total); +int callchain_node__fprintf_value(struct callchain_node *node, + FILE *fp, u64 total); void free_callchain(struct callchain_root *root); -- cgit v1.1 From 5e47f8ff406296bd078716d71283796ca5c6544b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:40 +0900 Subject: perf callchain: Add count fields to struct callchain_node It's to track the count of occurrences of the callchains. Signed-off-by: Namhyung Kim Acked-by: Brendan Gregg Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 10 ++++++++++ tools/perf/util/callchain.h | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index b948bd0..e390edd 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -437,6 +437,8 @@ add_child(struct callchain_node *parent, new->children_hit = 0; new->hit = period; + new->children_count = 0; + new->count = 1; return new; } @@ -484,6 +486,9 @@ split_add_child(struct callchain_node *parent, parent->children_hit = callchain_cumul_hits(new); new->val_nr = parent->val_nr - idx_local; parent->val_nr = idx_local; + new->count = parent->count; + new->children_count = parent->children_count; + parent->children_count = callchain_cumul_counts(new); /* create a new child for the new branch if any */ if (idx_total < cursor->nr) { @@ -494,6 +499,8 @@ split_add_child(struct callchain_node *parent, parent->hit = 0; parent->children_hit += period; + parent->count = 0; + parent->children_count += 1; node = callchain_cursor_current(cursor); new = add_child(parent, cursor, period); @@ -516,6 +523,7 @@ split_add_child(struct callchain_node *parent, rb_insert_color(&new->rb_node_in, &parent->rb_root_in); } else { parent->hit = period; + parent->count = 1; } } @@ -562,6 +570,7 @@ append_chain_children(struct callchain_node *root, inc_children_hit: root->children_hit += period; + root->children_count++; } static int @@ -614,6 +623,7 @@ append_chain(struct callchain_node *root, /* we match 100% of the path, increment the hit */ if (matches == root->val_nr && cursor->pos == cursor->nr) { root->hit += period; + root->count++; return 0; } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 060e636..cdb386d 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -60,6 +60,8 @@ struct callchain_node { struct rb_root rb_root_in; /* input tree of children */ struct rb_root rb_root; /* sorted output tree of children */ unsigned int val_nr; + unsigned int count; + unsigned int children_count; u64 hit; u64 children_hit; }; @@ -145,6 +147,11 @@ static inline u64 callchain_cumul_hits(struct callchain_node *node) return node->hit + node->children_hit; } +static inline unsigned callchain_cumul_counts(struct callchain_node *node) +{ + return node->count + node->children_count; +} + int callchain_register_param(struct callchain_param *param); int callchain_append(struct callchain_root *root, struct callchain_cursor *cursor, -- cgit v1.1 From f2af008695e0b54a58b76caecd52af7e6c97fb29 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:41 +0900 Subject: perf report: Add callchain value option Now -g/--call-graph option supports how to display callchain values. 
Possible values are 'percent', 'period' and 'count'. The percent value is the same as before and is the default behavior. The period displays the raw period value rather than the percentage. The count displays the number of occurrences. $ perf report --no-children --stdio -g percent ... 39.93% swapper [kernel.vmlinux] [k] intel_idle | ---intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry | |--28.63%-- start_secondary | --11.30%-- rest_init $ perf report --no-children --show-total-period --stdio -g period ... 39.93% 13018705 swapper [kernel.vmlinux] [k] intel_idle | ---intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry | |--9334403-- start_secondary | --3684302-- rest_init $ perf report --no-children --show-nr-samples --stdio -g count ... 39.93% 80 swapper [kernel.vmlinux] [k] intel_idle | ---intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry | |--57-- start_secondary | --23-- rest_init Signed-off-by: Namhyung Kim Acked-by: Brendan Gregg Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 13 ++++--- tools/perf/builtin-report.c | 4 +-- tools/perf/ui/stdio/hist.c | 10 +++++- tools/perf/util/callchain.c | 62 +++++++++++++++++++++++++++----- tools/perf/util/callchain.h | 10 +++++- tools/perf/util/util.c | 3 +- 6 files changed, 84 insertions(+), 18 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index f7d81aa..dab99ed 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -170,11 +170,11 @@ OPTIONS Dump raw trace in ASCII. -g:: ---call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch]>:: +--call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>:: Display call chains using type, min percent threshold, print limit, - call order, sort key and branch. Note that ordering of parameters is not - fixed so any parement can be given in an arbitraty order. One exception - is the print_limit which should be preceded by threshold. + call order, sort key, optional branch and value. Note that ordering of + parameters is not fixed so any parameter can be given in an arbitrary order. + One exception is the print_limit which should be preceded by threshold. print_type can be either: - flat: single column, linear exposure of call chains. @@ -205,6 +205,11 @@ OPTIONS - branch: include last branch information in callgraph when available. Usually more convenient to use --branch-history for this. + value can be: + - percent: display overhead percent (default) + - period: display event period + - count: display event count + --children:: Accumulate callchain of children to parent entry so that they can show up in the output.
The output will have a new "Children" column diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f256fac..1442834 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -625,7 +625,7 @@ parse_percent_limit(const struct option *opt, const char *str, return 0; } -#define CALLCHAIN_DEFAULT_OPT "graph,0.5,caller,function" +#define CALLCHAIN_DEFAULT_OPT "graph,0.5,caller,function,percent" const char report_callchain_help[] = "Display call graph (stack chain/backtrace):\n\n" CALLCHAIN_REPORT_HELP @@ -708,7 +708,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, "Only display entries with parent-match"), OPT_CALLBACK_DEFAULT('g', "call-graph", &report, - "print_type,threshold[,print_limit],order,sort_key[,branch]", + "print_type,threshold[,print_limit],order,sort_key[,branch],value", report_callchain_help, &report_parse_callchain_opt, callchain_default_opt), OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain, diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index f4de055..7ebc661 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -81,13 +81,14 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, int depth_mask, int left_margin) { struct rb_node *node, *next; - struct callchain_node *child; + struct callchain_node *child = NULL; struct callchain_list *chain; int new_depth_mask = depth_mask; u64 remaining; size_t ret = 0; int i; uint entries_printed = 0; + int cumul_count = 0; remaining = total_samples; @@ -99,6 +100,7 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, child = rb_entry(node, struct callchain_node, rb_node); cumul = callchain_cumul_hits(child); remaining -= cumul; + cumul_count += callchain_cumul_counts(child); /* * The depth mask manages the output of pipes that show @@ -148,6 +150,12 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, if (!rem_sq_bracket) return ret; + if (callchain_param.value == CCVAL_COUNT && child && child->parent) { + rem_node.count = child->parent->children_count - cumul_count; + if (rem_node.count <= 0) + return ret; + } + new_depth_mask &= ~(1 << (depth - 1)); ret += ipchain__fprintf_graph(fp, &rem_node, &rem_hits, depth, new_depth_mask, 0, total_samples, diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index e390edd..717c58c 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -83,6 +83,23 @@ static int parse_callchain_sort_key(const char *value) return -1; } +static int parse_callchain_value(const char *value) +{ + if (!strncmp(value, "percent", strlen(value))) { + callchain_param.value = CCVAL_PERCENT; + return 0; + } + if (!strncmp(value, "period", strlen(value))) { + callchain_param.value = CCVAL_PERIOD; + return 0; + } + if (!strncmp(value, "count", strlen(value))) { + callchain_param.value = CCVAL_COUNT; + return 0; + } + return -1; +} + static int __parse_callchain_report_opt(const char *arg, bool allow_record_opt) { @@ -106,7 +123,8 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) if (!parse_callchain_mode(tok) || !parse_callchain_order(tok) || - !parse_callchain_sort_key(tok)) { + !parse_callchain_sort_key(tok) || + !parse_callchain_value(tok)) { /* parsing ok - move on to the next */ try_stack_size = false; goto next; @@ -820,13 +838,27 @@ char *callchain_node__scnprintf_value(struct callchain_node *node, { double 
percent = 0.0; u64 period = callchain_cumul_hits(node); + unsigned count = callchain_cumul_counts(node); - if (callchain_param.mode == CHAIN_FOLDED) + if (callchain_param.mode == CHAIN_FOLDED) { period = node->hit; - if (total) - percent = period * 100.0 / total; + count = node->count; + } - scnprintf(bf, bfsize, "%.2f%%", percent); + switch (callchain_param.value) { + case CCVAL_PERIOD: + scnprintf(bf, bfsize, "%"PRIu64, period); + break; + case CCVAL_COUNT: + scnprintf(bf, bfsize, "%u", count); + break; + case CCVAL_PERCENT: + default: + if (total) + percent = period * 100.0 / total; + scnprintf(bf, bfsize, "%.2f%%", percent); + break; + } return bf; } @@ -835,13 +867,25 @@ int callchain_node__fprintf_value(struct callchain_node *node, { double percent = 0.0; u64 period = callchain_cumul_hits(node); + unsigned count = callchain_cumul_counts(node); - if (callchain_param.mode == CHAIN_FOLDED) + if (callchain_param.mode == CHAIN_FOLDED) { period = node->hit; - if (total) - percent = period * 100.0 / total; + count = node->count; + } - return percent_color_fprintf(fp, "%.2f%%", percent); + switch (callchain_param.value) { + case CCVAL_PERIOD: + return fprintf(fp, "%"PRIu64, period); + case CCVAL_COUNT: + return fprintf(fp, "%u", count); + case CCVAL_PERCENT: + default: + if (total) + percent = period * 100.0 / total; + return percent_color_fprintf(fp, "%.2f%%", percent); + } + return 0; } static void free_callchain_node(struct callchain_node *node) diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index cdb386d..47bc0c5 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -29,7 +29,8 @@ HELP_PAD "print_limit:\tmaximum number of call graph entry (<number>)\n" \ HELP_PAD "order:\t\tcall graph order (caller|callee)\n" \ HELP_PAD "sort_key:\tcall graph sort key (function|address)\n" \ - HELP_PAD "branch:\t\tinclude last branch info to call graph (branch)\n" + HELP_PAD "branch:\t\tinclude last branch info to call graph (branch)\n" \ + HELP_PAD "value:\t\tcall graph value (percent|period|count)\n" enum perf_call_graph_mode { CALLCHAIN_NONE, @@ -81,6 +82,12 @@ enum chain_key { CCKEY_ADDRESS }; +enum chain_value { + CCVAL_PERCENT, + CCVAL_PERIOD, + CCVAL_COUNT, +}; + struct callchain_param { bool enabled; enum perf_call_graph_mode record_mode; @@ -93,6 +100,7 @@ struct callchain_param { bool order_set; enum chain_key key; bool branch_callstack; + enum chain_value value; }; extern struct callchain_param callchain_param; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 47b1e36..75759ae 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -21,7 +21,8 @@ struct callchain_param callchain_param = { .mode = CHAIN_GRAPH_ABS, .min_percent = 0.5, .order = ORDER_CALLEE, - .key = CCKEY_FUNCTION + .key = CCKEY_FUNCTION, + .value = CCVAL_PERCENT, }; /* -- cgit v1.1 From 18bb838129b08fb0009b1ba1dc2f748a9537ee89 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:42 +0900 Subject: perf hists browser: Factor out hist_browser__show_callchain_list() This function prints a single callchain list entry. As it will be used by other functions, factor it out into a separate function.
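The helper returns the number of rows it consumed (zero while rows are still being skipped via arg->row_offset), so a caller can drive it from its list-walking loop, roughly like this simplified sketch (arguments abbreviated relative to the diff below):

	list_for_each_entry(chain, &child->val, list) {
		/* prints one callchain entry; returns rows consumed (0 or 1) */
		row += hist_browser__show_callchain_list(browser, child, chain,
							 row, total, need_percent,
							 offset, print, arg);
		if (is_output_full(browser, row))
			goto out;
	}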
Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-7-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 0b18857..0746d41 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -574,6 +574,44 @@ static bool hist_browser__check_dump_full(struct hist_browser *browser __maybe_u #define LEVEL_OFFSET_STEP 3 +static int hist_browser__show_callchain_list(struct hist_browser *browser, + struct callchain_node *node, + struct callchain_list *chain, + unsigned short row, u64 total, + bool need_percent, int offset, + print_callchain_entry_fn print, + struct callchain_print_arg *arg) +{ + char bf[1024], *alloc_str; + const char *str; + + if (arg->row_offset != 0) { + arg->row_offset--; + return 0; + } + + alloc_str = NULL; + str = callchain_list__sym_name(chain, bf, sizeof(bf), + browser->show_dso); + + if (need_percent) { + char buf[64]; + + callchain_node__scnprintf_value(node, buf, sizeof(buf), + total); + + if (asprintf(&alloc_str, "%s %s", buf, str) < 0) + str = "Not enough memory!"; + else + str = alloc_str; + } + + print(browser, chain, str, offset, row, arg); + + free(alloc_str); + return 1; +} + static int hist_browser__show_callchain(struct hist_browser *browser, struct rb_root *root, int level, unsigned short row, u64 total, @@ -598,8 +636,6 @@ static int hist_browser__show_callchain(struct hist_browser *browser, int extra_offset = 0; list_for_each_entry(chain, &child->val, list) { - char bf[1024], *alloc_str; - const char *str; bool was_first = first; if (first) @@ -608,34 +644,16 @@ static int hist_browser__show_callchain(struct hist_browser *browser, extra_offset = LEVEL_OFFSET_STEP; folded_sign = callchain_list__folded(chain); - if (arg->row_offset != 0) { - arg->row_offset--; - goto do_next; - } - alloc_str = NULL; - str = callchain_list__sym_name(chain, bf, sizeof(bf), - browser->show_dso); + row += hist_browser__show_callchain_list(browser, child, + chain, row, total, + was_first && need_percent, + offset + extra_offset, + print, arg); - if (was_first && need_percent) { - char buf[64]; - - callchain_node__scnprintf_value(child, buf, sizeof(buf), - total); - - if (asprintf(&alloc_str, "%s %s", buf, str) < 0) - str = "Not enough memory!"; - else - str = alloc_str; - } - - print(browser, chain, str, offset + extra_offset, row, arg); - - free(alloc_str); - - if (is_output_full(browser, ++row)) + if (is_output_full(browser, row)) goto out; -do_next: + if (folded_sign == '+') break; } -- cgit v1.1 From 4b3a3212233a042f48b7b8fedc64933e1ccd8643 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:43 +0900 Subject: perf hists browser: Support flat callchains The flat callchain mode prints all chains in a single, simple hierarchy, to make them easy to see. Currently perf report --tui doesn't show flat callchains properly. With flat callchains, only leaf nodes are added to the final rbtree, so it should also show the entries in parent nodes. To do that, add a parent_val list to struct callchain_node and show it along with the (normal) val list. For example, consider the following callchains with '-g graph'.
$ perf report -g graph - 39.93% swapper [kernel.vmlinux] [k] intel_idle intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle - cpu_startup_entry 28.63% start_secondary - 11.30% rest_init start_kernel x86_64_start_reservations x86_64_start_kernel Before: $ perf report -g flat - 39.93% swapper [kernel.vmlinux] [k] intel_idle 28.63% start_secondary - 11.30% rest_init start_kernel x86_64_start_reservations x86_64_start_kernel After: $ perf report -g flat - 39.93% swapper [kernel.vmlinux] [k] intel_idle - 28.63% intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry start_secondary - 11.30% intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry start_kernel x86_64_start_reservations x86_64_start_kernel Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Tested-by: Brendan Gregg Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 122 ++++++++++++++++++++++++++++++++++++++++- tools/perf/util/callchain.c | 44 +++++++++++++++ tools/perf/util/callchain.h | 2 + 3 files changed, 166 insertions(+), 2 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 0746d41..c44af46 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -178,12 +178,44 @@ static int callchain_node__count_rows_rb_tree(struct callchain_node *node) return n; } +static int callchain_node__count_flat_rows(struct callchain_node *node) +{ + struct callchain_list *chain; + char folded_sign = 0; + int n = 0; + + list_for_each_entry(chain, &node->parent_val, list) { + if (!folded_sign) { + /* only check first chain list entry */ + folded_sign = callchain_list__folded(chain); + if (folded_sign == '+') + return 1; + } + n++; + } + + list_for_each_entry(chain, &node->val, list) { + if (!folded_sign) { + /* node->parent_val list might be empty */ + folded_sign = callchain_list__folded(chain); + if (folded_sign == '+') + return 1; + } + n++; + } + + return n; +} + static int callchain_node__count_rows(struct callchain_node *node) { struct callchain_list *chain; bool unfolded = false; int n = 0; + if (callchain_param.mode == CHAIN_FLAT) + return callchain_node__count_flat_rows(node); + list_for_each_entry(chain, &node->val, list) { ++n; unfolded = chain->unfolded; @@ -263,7 +295,7 @@ static void callchain_node__init_have_children(struct callchain_node *node, chain = list_entry(node->val.next, struct callchain_list, list); chain->has_children = has_sibling; - if (!list_empty(&node->val)) { + if (node->val.next != node->val.prev) { chain = list_entry(node->val.prev, struct callchain_list, list); chain->has_children = !RB_EMPTY_ROOT(&node->rb_root); } @@ -279,6 +311,8 @@ static void callchain__init_have_children(struct rb_root *root) for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node); callchain_node__init_have_children(node, has_sibling); + if (callchain_param.mode == CHAIN_FLAT) + callchain_node__make_parent_list(node); } } @@ -612,6 +646,83 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, return 1; } +static int hist_browser__show_callchain_flat(struct hist_browser *browser, + struct rb_root *root, + unsigned short row, u64 total, + print_callchain_entry_fn print, + struct callchain_print_arg *arg, + 
check_output_full_fn is_output_full) +{ + struct rb_node *node; + int first_row = row, offset = LEVEL_OFFSET_STEP; + bool need_percent; + + node = rb_first(root); + need_percent = node && rb_next(node); + + while (node) { + struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); + struct rb_node *next = rb_next(node); + struct callchain_list *chain; + char folded_sign = ' '; + int first = true; + int extra_offset = 0; + + list_for_each_entry(chain, &child->parent_val, list) { + bool was_first = first; + + if (first) + first = false; + else if (need_percent) + extra_offset = LEVEL_OFFSET_STEP; + + folded_sign = callchain_list__folded(chain); + + row += hist_browser__show_callchain_list(browser, child, + chain, row, total, + was_first && need_percent, + offset + extra_offset, + print, arg); + + if (is_output_full(browser, row)) + goto out; + + if (folded_sign == '+') + goto next; + } + + list_for_each_entry(chain, &child->val, list) { + bool was_first = first; + + if (first) + first = false; + else if (need_percent) + extra_offset = LEVEL_OFFSET_STEP; + + folded_sign = callchain_list__folded(chain); + + row += hist_browser__show_callchain_list(browser, child, + chain, row, total, + was_first && need_percent, + offset + extra_offset, + print, arg); + + if (is_output_full(browser, row)) + goto out; + + if (folded_sign == '+') + break; + } + +next: + if (is_output_full(browser, row)) + break; + node = next; + } +out: + return row - first_row; +} + static int hist_browser__show_callchain(struct hist_browser *browser, struct rb_root *root, int level, unsigned short row, u64 total, @@ -864,10 +975,17 @@ static int hist_browser__show_entry(struct hist_browser *browser, total = entry->stat.period; } - printed += hist_browser__show_callchain(browser, + if (callchain_param.mode == CHAIN_FLAT) { + printed += hist_browser__show_callchain_flat(browser, + &entry->sorted_chain, row, total, + hist_browser__show_callchain_entry, &arg, + hist_browser__check_output_full); + } else { + printed += hist_browser__show_callchain(browser, &entry->sorted_chain, 1, row, total, hist_browser__show_callchain_entry, &arg, hist_browser__check_output_full); + } if (arg.is_current_entry) browser->he_selection = entry; diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 717c58c..fc3b1e0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -387,6 +387,7 @@ create_child(struct callchain_node *parent, bool inherit_children) } new->parent = parent; INIT_LIST_HEAD(&new->val); + INIT_LIST_HEAD(&new->parent_val); if (inherit_children) { struct rb_node *n; @@ -894,6 +895,11 @@ static void free_callchain_node(struct callchain_node *node) struct callchain_node *child; struct rb_node *n; + list_for_each_entry_safe(list, tmp, &node->parent_val, list) { + list_del(&list->list); + free(list); + } + list_for_each_entry_safe(list, tmp, &node->val, list) { list_del(&list->list); free(list); @@ -917,3 +923,41 @@ void free_callchain(struct callchain_root *root) free_callchain_node(&root->node); } + +int callchain_node__make_parent_list(struct callchain_node *node) +{ + struct callchain_node *parent = node->parent; + struct callchain_list *chain, *new; + LIST_HEAD(head); + + while (parent) { + list_for_each_entry_reverse(chain, &parent->val, list) { + new = malloc(sizeof(*new)); + if (new == NULL) + goto out; + *new = *chain; + new->has_children = false; + list_add_tail(&new->list, &head); + } + parent = parent->parent; + } + + list_for_each_entry_safe_reverse(chain, new, 
&head, list) + list_move_tail(&chain->list, &node->parent_val); + + if (!list_empty(&node->parent_val)) { + chain = list_first_entry(&node->parent_val, struct callchain_list, list); + chain->has_children = rb_prev(&node->rb_node) || rb_next(&node->rb_node); + + chain = list_first_entry(&node->val, struct callchain_list, list); + chain->has_children = false; + } + return 0; + +out: + list_for_each_entry_safe(chain, new, &head, list) { + list_del(&chain->list); + free(chain); + } + return -ENOMEM; +} diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 47bc0c5..6e9b5f2 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -56,6 +56,7 @@ enum chain_order { struct callchain_node { struct callchain_node *parent; struct list_head val; + struct list_head parent_val; struct rb_node rb_node_in; /* to insert nodes in an rbtree */ struct rb_node rb_node; /* to sort nodes in an output tree */ struct rb_root rb_root_in; /* input tree of children */ @@ -251,5 +252,6 @@ int callchain_node__fprintf_value(struct callchain_node *node, FILE *fp, u64 total); void free_callchain(struct callchain_root *root); +int callchain_node__make_parent_list(struct callchain_node *node); #endif /* __PERF_CALLCHAIN_H */ -- cgit v1.1 From 8c430a34869946f1f5852f02d910ceef80040be5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:44 +0900 Subject: perf hists browser: Support folded callchains The folded callchain mode prints all chains in a single line. Currently perf report --tui doesn't support folded callchains. Like flat callchains, only leaf nodes are added to the final rbtree, so it should also show the entries in parent nodes. To do that, reuse the parent_val list added to struct callchain_node for flat callchains and show it along with the (normal) val list. For example, a folded callchain looks like this: $ perf report -g folded --tui Samples: 234 of event 'cycles:pp', Event count (approx.): 32605268 Overhead Command Shared Object Symbol - 39.93% swapper [kernel.vmlinux] [k] intel_idle + 28.63% intel_idle; cpuidle_enter_state; cpuidle_enter; ... + 11.30% intel_idle; cpuidle_enter_state; cpuidle_enter; ...
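Such folded lines are convenient for post-processing; as a rough illustration (not part of this patch), a consumer written in C could split one line back into its frames like this, allowing for the optional space after the separator:

	#include <stdio.h>
	#include <string.h>

	/* Split "28.63% intel_idle;cpuidle_enter_state;..." into frames. */
	static void print_frames(char *line)
	{
		char *save, *tok;
		char *frames = strchr(line, ' ');	/* skip the value column */

		if (!frames)
			return;
		for (tok = strtok_r(frames + 1, ";", &save); tok;
		     tok = strtok_r(NULL, ";", &save)) {
			while (*tok == ' ')		/* trim leading space */
				tok++;
			printf("frame: %s\n", tok);
		}
	}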
Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Tested-by: Brendan Gregg Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-9-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 125 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index c44af46..a211b7b 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -207,6 +207,11 @@ static int callchain_node__count_flat_rows(struct callchain_node *node) return n; } +static int callchain_node__count_folded_rows(struct callchain_node *node __maybe_unused) +{ + return 1; +} + static int callchain_node__count_rows(struct callchain_node *node) { struct callchain_list *chain; @@ -215,6 +220,8 @@ static int callchain_node__count_rows(struct callchain_node *node) if (callchain_param.mode == CHAIN_FLAT) return callchain_node__count_flat_rows(node); + else if (callchain_param.mode == CHAIN_FOLDED) + return callchain_node__count_folded_rows(node); list_for_each_entry(chain, &node->val, list) { ++n; @@ -311,7 +318,8 @@ static void callchain__init_have_children(struct rb_root *root) for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node); callchain_node__init_have_children(node, has_sibling); - if (callchain_param.mode == CHAIN_FLAT) + if (callchain_param.mode == CHAIN_FLAT || + callchain_param.mode == CHAIN_FOLDED) callchain_node__make_parent_list(node); } } @@ -723,6 +731,116 @@ out: return row - first_row; } +static char *hist_browser__folded_callchain_str(struct hist_browser *browser, + struct callchain_list *chain, + char *value_str, char *old_str) +{ + char bf[1024]; + const char *str; + char *new; + + str = callchain_list__sym_name(chain, bf, sizeof(bf), + browser->show_dso); + if (old_str) { + if (asprintf(&new, "%s%s%s", old_str, + symbol_conf.field_sep ?: ";", str) < 0) + new = NULL; + } else { + if (value_str) { + if (asprintf(&new, "%s %s", value_str, str) < 0) + new = NULL; + } else { + if (asprintf(&new, "%s", str) < 0) + new = NULL; + } + } + return new; +} + +static int hist_browser__show_callchain_folded(struct hist_browser *browser, + struct rb_root *root, + unsigned short row, u64 total, + print_callchain_entry_fn print, + struct callchain_print_arg *arg, + check_output_full_fn is_output_full) +{ + struct rb_node *node; + int first_row = row, offset = LEVEL_OFFSET_STEP; + bool need_percent; + + node = rb_first(root); + need_percent = node && rb_next(node); + + while (node) { + struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); + struct rb_node *next = rb_next(node); + struct callchain_list *chain, *first_chain = NULL; + int first = true; + char *value_str = NULL, *value_str_alloc = NULL; + char *chain_str = NULL, *chain_str_alloc = NULL; + + if (arg->row_offset != 0) { + arg->row_offset--; + goto next; + } + + if (need_percent) { + char buf[64]; + + callchain_node__scnprintf_value(child, buf, sizeof(buf), total); + if (asprintf(&value_str, "%s", buf) < 0) { + value_str = (char *)"<...>"; + goto do_print; + } + value_str_alloc = value_str; + } + + list_for_each_entry(chain, &child->parent_val, list) { + chain_str = hist_browser__folded_callchain_str(browser, + chain, value_str, chain_str); + if (first) { + first = false; + 
first_chain = chain; + } + + if (chain_str == NULL) { + chain_str = (char *)"Not enough memory!"; + goto do_print; + } + + chain_str_alloc = chain_str; + } + + list_for_each_entry(chain, &child->val, list) { + chain_str = hist_browser__folded_callchain_str(browser, + chain, value_str, chain_str); + if (first) { + first = false; + first_chain = chain; + } + + if (chain_str == NULL) { + chain_str = (char *)"Not enough memory!"; + goto do_print; + } + + chain_str_alloc = chain_str; + } + +do_print: + print(browser, first_chain, chain_str, offset, row++, arg); + free(value_str_alloc); + free(chain_str_alloc); + +next: + if (is_output_full(browser, row)) + break; + node = next; + } + + return row - first_row; +} + static int hist_browser__show_callchain(struct hist_browser *browser, struct rb_root *root, int level, unsigned short row, u64 total, @@ -980,6 +1098,11 @@ static int hist_browser__show_entry(struct hist_browser *browser, &entry->sorted_chain, row, total, hist_browser__show_callchain_entry, &arg, hist_browser__check_output_full); + } else if (callchain_param.mode == CHAIN_FOLDED) { + printed += hist_browser__show_callchain_folded(browser, + &entry->sorted_chain, row, total, + hist_browser__show_callchain_entry, &arg, + hist_browser__check_output_full); } else { printed += hist_browser__show_callchain(browser, &entry->sorted_chain, 1, row, total, -- cgit v1.1 From 3cd99dfd1c87067fb28a19fee76500aed56d7c8f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:45 +0900 Subject: perf ui/gtk: Support flat callchains The flat callchain mode prints all chains in a simple flat hierarchy, to make them easy to see. Currently perf report --gtk doesn't show flat callchains properly. With flat callchains, only leaf nodes are added to the final rbtree, so it should also show the entries in parent nodes. To do that, show the parent_val list added to struct callchain_node along with the (normal) val list. See the previous commit on TUI support for more information.
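In both the TUI and GTK code the rendering then reduces to walking the two lists back to back; schematically (print_one() is a placeholder for the UI-specific output call, not a real function):

	/* frames synthesized from ancestor nodes come first ... */
	list_for_each_entry(chain, &node->parent_val, list)
		print_one(chain);
	/* ... followed by the leaf node's own frames */
	list_for_each_entry(chain, &node->val, list)
		print_one(chain);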
Signed-off-by: Namhyung Kim Tested-by: Brendan Gregg Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-10-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/gtk/hists.c | 80 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 4 deletions(-) diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index cff7bb9..0b24cd6 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -89,8 +89,71 @@ void perf_gtk__init_hpp(void) perf_gtk__hpp_color_overhead_acc; } -static void perf_gtk__add_callchain(struct rb_root *root, GtkTreeStore *store, - GtkTreeIter *parent, int col, u64 total) +static void perf_gtk__add_callchain_flat(struct rb_root *root, GtkTreeStore *store, + GtkTreeIter *parent, int col, u64 total) +{ + struct rb_node *nd; + bool has_single_node = (rb_first(root) == rb_last(root)); + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct callchain_node *node; + struct callchain_list *chain; + GtkTreeIter iter, new_parent; + bool need_new_parent; + + node = rb_entry(nd, struct callchain_node, rb_node); + + new_parent = *parent; + need_new_parent = !has_single_node; + + callchain_node__make_parent_list(node); + + list_for_each_entry(chain, &node->parent_val, list) { + char buf[128]; + + gtk_tree_store_append(store, &iter, &new_parent); + + callchain_node__scnprintf_value(node, buf, sizeof(buf), total); + gtk_tree_store_set(store, &iter, 0, buf, -1); + + callchain_list__sym_name(chain, buf, sizeof(buf), false); + gtk_tree_store_set(store, &iter, col, buf, -1); + + if (need_new_parent) { + /* + * Only show the top-most symbol in a callchain + * if it's not the only callchain. + */ + new_parent = iter; + need_new_parent = false; + } + } + + list_for_each_entry(chain, &node->val, list) { + char buf[128]; + + gtk_tree_store_append(store, &iter, &new_parent); + + callchain_node__scnprintf_value(node, buf, sizeof(buf), total); + gtk_tree_store_set(store, &iter, 0, buf, -1); + + callchain_list__sym_name(chain, buf, sizeof(buf), false); + gtk_tree_store_set(store, &iter, col, buf, -1); + + if (need_new_parent) { + /* + * Only show the top-most symbol in a callchain + * if it's not the only callchain. 
+ */ + new_parent = iter; + need_new_parent = false; + } + } + } +} + +static void perf_gtk__add_callchain_graph(struct rb_root *root, GtkTreeStore *store, + GtkTreeIter *parent, int col, u64 total) { struct rb_node *nd; bool has_single_node = (rb_first(root) == rb_last(root)); @@ -134,11 +197,20 @@ static void perf_gtk__add_callchain(struct rb_root *root, GtkTreeStore *store, child_total = total; /* Now 'iter' contains info of the last callchain_list */ - perf_gtk__add_callchain(&node->rb_root, store, &iter, col, - child_total); + perf_gtk__add_callchain_graph(&node->rb_root, store, &iter, col, + child_total); } } +static void perf_gtk__add_callchain(struct rb_root *root, GtkTreeStore *store, + GtkTreeIter *parent, int col, u64 total) +{ + if (callchain_param.mode == CHAIN_FLAT) + perf_gtk__add_callchain_flat(root, store, parent, col, total); + else + perf_gtk__add_callchain_graph(root, store, parent, col, total); +} + static void on_row_activated(GtkTreeView *view, GtkTreePath *path, GtkTreeViewColumn *col __maybe_unused, gpointer user_data __maybe_unused) -- cgit v1.1 From 2c6caff2b26fde8f3f87183f8c97f2cebfdbcb98 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Nov 2015 14:45:46 +0900 Subject: perf ui/gtk: Support folded callchains The folded callchain mode is to print all chains in a single line. Currently perf report --gtk doesn't support folded callchains. Like flat callchains, only leaf nodes are added to the final rbtree so it should show entries in parent nodes. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Tested-by: Brendan Gregg Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Pekka Enberg Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1447047946-1691-11-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/gtk/hists.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 0b24cd6..4677172 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -152,6 +152,66 @@ static void perf_gtk__add_callchain_flat(struct rb_root *root, GtkTreeStore *sto } } +static void perf_gtk__add_callchain_folded(struct rb_root *root, GtkTreeStore *store, + GtkTreeIter *parent, int col, u64 total) +{ + struct rb_node *nd; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct callchain_node *node; + struct callchain_list *chain; + GtkTreeIter iter; + char buf[64]; + char *str, *str_alloc = NULL; + bool first = true; + + node = rb_entry(nd, struct callchain_node, rb_node); + + callchain_node__make_parent_list(node); + + list_for_each_entry(chain, &node->parent_val, list) { + char name[1024]; + + callchain_list__sym_name(chain, name, sizeof(name), false); + + if (asprintf(&str, "%s%s%s", + first ? "" : str_alloc, + first ? "" : symbol_conf.field_sep ?: "; ", + name) < 0) + return; + + first = false; + free(str_alloc); + str_alloc = str; + } + + list_for_each_entry(chain, &node->val, list) { + char name[1024]; + + callchain_list__sym_name(chain, name, sizeof(name), false); + + if (asprintf(&str, "%s%s%s", + first ? "" : str_alloc, + first ? 
"" : symbol_conf.field_sep ?: "; ", + name) < 0) + return; + + first = false; + free(str_alloc); + str_alloc = str; + } + + gtk_tree_store_append(store, &iter, parent); + + callchain_node__scnprintf_value(node, buf, sizeof(buf), total); + gtk_tree_store_set(store, &iter, 0, buf, -1); + + gtk_tree_store_set(store, &iter, col, str, -1); + + free(str_alloc); + } +} + static void perf_gtk__add_callchain_graph(struct rb_root *root, GtkTreeStore *store, GtkTreeIter *parent, int col, u64 total) { @@ -207,6 +267,8 @@ static void perf_gtk__add_callchain(struct rb_root *root, GtkTreeStore *store, { if (callchain_param.mode == CHAIN_FLAT) perf_gtk__add_callchain_flat(root, store, parent, col, total); + else if (callchain_param.mode == CHAIN_FOLDED) + perf_gtk__add_callchain_folded(root, store, parent, col, total); else perf_gtk__add_callchain_graph(root, store, parent, col, total); } -- cgit v1.1 From 2fde46b79e2fdbc90d0d97cf992782732b5a371c Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 22 Nov 2015 18:16:15 -0500 Subject: x86/smpboot: Re-enable init_udelay=0 by default on modern CPUs Fix a Linux-4.3 corner case performance regression, introduced by commit: f1ccd249319e ("x86/smpboot: Fix cpu_init_udelay=10000 corner case boot parameter misbehavior") which allowed the cmdline "cpu_init_udelay=" to work with all values, including the default of 10000. But in setting the default of 10000, it over-rode the code stat sets the delay 0 on modern processors. Also, tidy up use of INT/UINT. Reported-by: Shane Signed-off-by: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dparsons@brightdsl.net Link: http://lkml.kernel.org/r/9082eb809ef40dad02db714759c7aaf618c518d4.1448232494.git.len.brown@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5..fbabe4f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } -- cgit v1.1 From 38c6ade2dd4dcc3bca06c981e2a1b91289046177 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 20 Oct 2015 13:04:41 +0100 Subject: sched/fair: Remove empty idle enter and exit functions Commit cd126afe838d ("sched/fair: Remove rq's runnable avg") got rid of rq->avg and so there is no need to update it any more when entering or exiting idle. Remove the now empty functions idle_{enter|exit}_fair(). 
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1445342681-17171-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 24 +----------------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 8 -------- 3 files changed, 1 insertion(+), 32 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8..2779dec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2835,24 +2835,6 @@ void remove_entity_load_avg(struct sched_entity *se) atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. - */ -void idle_enter_fair(struct rq *this_rq) -{ -} - -/* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. - */ -void idle_exit_fair(struct rq *this_rq) -{ -} - static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) { return cfs_rq->runnable_load_avg; @@ -7248,8 +7230,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7330,10 +7310,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1..47ce949 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - idle_exit_fair(rq); rq_last_tick_reset(rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efd3bfc..2eb2002 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1249,16 +1249,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_enter_fair(struct rq *this_rq); -extern void idle_exit_fair(struct rq *this_rq); - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -#else - -static inline void idle_enter_fair(struct rq *rq) { } -static inline void idle_exit_fair(struct rq *rq) { } - #endif #ifdef CONFIG_CPU_IDLE -- cgit v1.1 From 69e51e92a394088fc3266ed5136903074b44f3c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Oct 2015 14:32:34 +0200 Subject: sched/wait: Document waitqueue_active() Kosuke reports that there were a fair number of buggy waitqueue_active() users, and this function deserves a big comment to avoid growing more of them.
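The pattern the new comment documents looks like this when spelled out (sketch only; cond, wq and wait are placeholders, and the waiter-side barrier comes from set_current_state() inside prepare_to_wait()):

	/* waker */
	cond = true;
	smp_mb();	/* order the cond store against the lockless check */
	if (waitqueue_active(&wq))
		wake_up(&wq);

	/* waiter */
	for (;;) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (cond)
			break;
		schedule();
	}
	finish_wait(&wq, &wait);

Without the smp_mb() the waitqueue_active() load can be hoisted above the cond store, so the waker may observe an empty wait list while the waiter has not yet observed cond, and the wakeup is lost.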
Reported-by: Kosuke Tatsukawa Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/wait.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f..f3bac30 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -102,6 +102,36 @@ init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) q->func = func; } +/** + * waitqueue_active -- locklessly test for waiters on the queue + * @q: the waitqueue to test for waiters + * + * returns true if the wait list is not empty + * + * NOTE: this function is lockless and requires care, incorrect usage _will_ + * lead to sporadic and non-obvious failure. + * + * Use either while holding wait_queue_head_t::lock or when used for wakeups + * with an extra smp_mb() like: + * + * CPU0 - waker CPU1 - waiter + * + * for (;;) { + * @cond = true; prepare_to_wait(&wq, &wait, state); + * smp_mb(); // smp_mb() from set_current_state() + * if (waitqueue_active(wq)) if (@cond) + * wake_up(wq); break; + * schedule(); + * } + * finish_wait(&wq, &wait); + * + * Because without the explicit smp_mb() it's possible for the + * waitqueue_active() load to get hoisted over the @cond store such that we'll + * observe an empty wait list while the waiter might not observe @cond. + * + * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), + * which (when the lock is uncontended) are of roughly equal cost. + */ static inline int waitqueue_active(wait_queue_head_t *q) { return !list_empty(&q->task_list); -- cgit v1.1 From d937cdc59e363baf8d5c757d944b13ebfa33e729 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Oct 2015 13:49:30 +0200 Subject: sched/fair: Clean up the explanation around decaying load update misses Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 53 ++++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2779dec..8f3905e3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4222,42 +4222,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load + * + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load * - * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. 
- * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. + * The calculation is approximated on a 128 point scale. */ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog -- cgit v1.1 From 59543275488d18d878cd2ab2b1072efc1e9ac1c4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 14 Oct 2015 18:47:35 +0900 Subject: sched/fair: Prepare __update_cpu_load() to handle active tickless There are some cases where distance between ticks is more than one tick while the CPU is not idle, e.g. full NOHZ. However __update_cpu_load() assumes it is the idle tickless case if the distance between ticks is more than 1, even though it can be the active tickless case as well. Thus in the active tickless case, updating the CPU load will not be performed correctly. Where the current code assumes the load for each tick is zero, this is (obviously) not true in non-idle tickless case. We can approximately consider the load ~= this_rq->cpu_load[0] during tickless in non-idle tickless case. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1444816056-11886-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8f3905e3..404006a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4283,14 +4283,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). 
+ * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick which creates the + * need for the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active parameter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4302,8 +4334,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i]; + old_load = this_rq->cpu_load[i] - tickless_load; old_load = decay_load_missed(old_load, pending_updates - 1, i); + old_load += tickless_load; new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4358,7 +4391,7 @@ static void update_idle_cpu_load(struct rq *this_rq) pending_updates = curr_jiffies - this_rq->last_load_update_tick; this_rq->last_load_update_tick = curr_jiffies; - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load(this_rq, load, pending_updates, 0); } /* @@ -4381,7 +4414,7 @@ void update_cpu_load_nohz(void) * We were idle, this means load 0, the current load might be * !0 due to remote wakeups and the sort. */ - __update_cpu_load(this_rq, 0, pending_updates); + __update_cpu_load(this_rq, 0, pending_updates, 0); } raw_spin_unlock(&this_rq->lock); } @@ -4397,7 +4430,7 @@ void update_cpu_load_active(struct rq *this_rq) * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* -- cgit v1.1 From 525705d15e63b7455977408e4601e76e6bc41524 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Tue, 10 Nov 2015 09:36:02 +0900 Subject: sched/fair: Consider missed ticks in NOHZ_FULL in update_cpu_load_nohz() Usually the tick can be stopped for an idle CPU in NOHZ. However in NOHZ_FULL mode, a non-idle CPU's tick can also be stopped. However, update_cpu_load_nohz() does not consider the case where a non-idle CPU's tick has been stopped at all. This patch makes update_cpu_load_nohz() aware of whether the calling path comes from NOHZ_FULL or idle NOHZ.
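To illustrate what the @active handling amounts to (a sketch only, with signed types for clarity; the kernel uses the precomputed decay_load_missed() table instead of the loop):

	/* decay 'load' for 'missed' ticks toward 'tickless_load' */
	static long decay_toward(long load, long tickless_load, int missed, int i)
	{
		load -= tickless_load;		/* make tickless_load the fixed point */
		while (missed--)
			load -= load >> i;	/* one tick of (1 - 1/2^i) decay */
		return load + tickless_load;
	}

With tickless_load == 0 (the idle case) this reduces to the old behaviour of decaying toward zero; with tickless_load == cpu_load[0] (the NOHZ_FULL case) the missed ticks decay toward the load that was assumed constant while the tick was off.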
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447115762-19734-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- kernel/sched/fair.c | 10 ++++++---- kernel/time/tick-sched.c | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index edad7a4..f425aac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -177,9 +177,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(void); +extern void update_cpu_load_nohz(int active); #else -static inline void update_cpu_load_nohz(void) { } +static inline void update_cpu_load_nohz(int active) { } #endif extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 404006a..309b1d5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4397,10 +4397,11 @@ static void update_idle_cpu_load(struct rq *this_rq) /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) @@ -4411,10 +4412,11 @@ void update_cpu_load_nohz(void) if (pending_updates) { this_rq->last_load_update_tick = curr_jiffies; /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. 
*/ - __update_cpu_load(this_rq, 0, pending_updates, 0); + __update_cpu_load(this_rq, load, pending_updates, active); } raw_spin_unlock(&this_rq->lock); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7c7ec45..515edf3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -694,11 +694,11 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(); + update_cpu_load_nohz(active); calc_load_exit_idle(); touch_softlockup_watchdog(); @@ -725,7 +725,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick()) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get()); + tick_nohz_restart_sched_tick(ts, ktime_get(), 1); #endif } @@ -916,7 +916,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); + tick_nohz_restart_sched_tick(ts, now, 0); tick_nohz_account_idle_ticks(ts); } -- cgit v1.1 From 51170840fe91dfca10fd533b303ea39b2524782a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 5 Nov 2015 15:56:23 -0500 Subject: sched/numa: Cap PTE scanning overhead to 3% of run time There is a fundamental mismatch between the runtime based NUMA scanning at the task level, and the wall clock time NUMA scanning at the mm level. On a severely overloaded system, with very large processes, this mismatch can cause the system to spend all of its time in change_prot_numa(). This can happen if the task spends at least two ticks in change_prot_numa(), and only gets two ticks of CPU time in the real time between two scan intervals of the mm. This patch ensures that a task never spends more than 3% of run time scanning PTEs. It does that by ensuring that in-between task_numa_work() runs, the task spends at least 32x as much time on other things than it did on task_numa_work(). This is done stochastically: if a timer tick happens, or the task gets rescheduled during task_numa_work(), we delay a future run of task_numa_work() until the task has spent at least 32x the amount of CPU time doing something else, as it spent inside task_numa_work(). The longer task_numa_work() takes, the more likely it is this happens. If task_numa_work() takes very little time, chances are low that that code will do anything, but we will not care. 
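To make the arithmetic concrete (hypothetical numbers, for illustration only): if one task_numa_work() run consumed diff = 2 ms of runtime, p->node_stamp is pushed forward by 32 * 2 ms = 64 ms, so the next run is deferred until the task has accrued that much additional runtime. The scanning share is therefore bounded by diff / (diff + 32 * diff) = 1/33, roughly 3%.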
Reported-and-tested-by: Jan Stancek Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/1446756983-28173-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 309b1d5..95b944e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2155,6 +2155,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2277,6 +2278,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. + */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* -- cgit v1.1 From 3ea94de15ce9f3a217f6d0a7e9e0f48388902bb7 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 12 Nov 2015 19:38:54 -0800 Subject: sched/core: Fix incorrect wait time and wait count statistics At present scheduler resets task's wait start timestamp when the task migrates to another rq. This misleads scheduler itself into reporting less wait time than actual by omitting time spent for waiting prior to migration and also more wait count than actual by counting migration as wait end event which can be seen by trace or /proc//sched with CONFIG_SCHEDSTATS=y. Carry forward migrating task's wait time prior to migration and don't count migration as a wait end event to fix such statistics error. In order to determine whether task is migrating mark task->on_rq with TASK_ON_RQ_MIGRATING while dequeuing and enqueuing due to migration. 
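A sketch of the carry-forward (illustration only; clock and locking details elided):

	/* dequeue on the old rq for migration: stash the wait accrued so far */
	delta = rq_clock(src_rq) - se->statistics.wait_start;
	se->statistics.wait_start = delta;

	/* enqueue on the new rq: rebase the stamp so the stash is included */
	se->statistics.wait_start = rq_clock(dst_rq) - se->statistics.wait_start;

	/*
	 * The eventual wait end on the new rq then computes
	 * rq_clock(dst_rq) - wait_start = wait before + wait after migration,
	 * and only that event bumps wait_count and wait_sum.
	 */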
Signed-off-by: Joonwoo Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: ohaugan@codeaurora.org Link: http://lkml.kernel.org/r/20151113033854.GA4247@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 15 ++++++++++-- kernel/sched/fair.c | 67 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9..1b7cb5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); raw_spin_unlock(&rq->lock); @@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95b944e..f7017ad 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; } +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
+ */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -5745,8 +5772,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5879,8 +5906,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } -- cgit v1.1 From 01783e0d452736d7deff1b920c5eccad67adc428 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 15 Nov 2015 18:18:40 +0800 Subject: sched/core: Use list_is_singular() in sched_can_stop_tick() Use list_is_singular() to check if run_list has only one entry. Signed-off-by: Geliang Tang Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a5453fafd735affcf28e53a1d0a3d6965cb5dbb5.1447582547.git.geliangtang@163.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b7cb5e..5b420d2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -731,7 +731,7 @@ bool sched_can_stop_tick(void) if (current->policy == SCHED_RR) { struct sched_rt_entity *rt_se = ¤t->rt; - return rt_se->run_list.prev == rt_se->run_list.next; + return list_is_singular(&rt_se->run_list); } /* -- cgit v1.1 From 64038f292a1b33c7d46bd11f62f7798101152c00 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:11 +0100 Subject: stop_machine: Fix possible cpu_stopper_thread() crash stop_one_cpu_nowait(fn) will crash the kernel if the callback returns nonzero, work->done == NULL in this case. This needs more cleanups, cpu_stop_signal_done() is called right after we check done != NULL and it does the same check. 
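For illustration, the crashing combination (my_stop_fn is hypothetical; stop_one_cpu_nowait() passes no completion, so work->done is NULL):

	static int my_stop_fn(void *arg)	/* hypothetical callback */
	{
		return -EAGAIN;			/* any non-zero return value */
	}

	static struct cpu_stop_work work;

	stop_one_cpu_nowait(cpu, my_stop_fn, NULL, &work);
	/* cpu_stopper_thread(): ret != 0 with done == NULL,
	 * so 'done->ret = ret' dereferences a NULL pointer */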
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193311.GA8242@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20..1a66a95 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -454,7 +454,7 @@ repeat: preempt_disable(); ret = fn(arg); - if (ret) + if (ret && done) done->ret = ret; /* restore preemption and check it's still balanced */ -- cgit v1.1 From 6a19005157c464b47b2082f2617d12bc11198a0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:14 +0100 Subject: stop_machine: Don't disable preemption in stop_two_cpus() Now that stop_two_cpus() path does not check cpu_active() we can remove preempt_disable(), it was only needed to ensure that stop_machine() can not be called after we observe cpu_active() == T and before we queue the new work. Also, turn the pointless and confusing ->executed check into WARN_ON(). We know that both works must be executed, otherwise we have a bug. And in fact I think that done->executed should die, see the next changes. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193314.GA8249@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1a66a95..17f01a9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * struct cpu_stop_work work1, work2; struct multi_stop_data msdata; - preempt_disable(); msdata = (struct multi_stop_data){ .fn = fn, .data = arg, @@ -277,16 +276,12 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * if (cpu1 > cpu2) swap(cpu1, cpu2); - if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { - preempt_enable(); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; - } - - preempt_enable(); wait_for_completion(&done.completion); - - return done.executed ? done.ret : -ENOENT; + WARN_ON(!done.executed); + return done.ret; } /** -- cgit v1.1 From 1b034bd989aa4a396c13d305759c376c52595a97 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 17 Nov 2015 18:05:23 +0100 Subject: stop_machine: Make cpu_stop_queue_work() and stop_one_cpu_nowait() return bool Change cpu_stop_queue_work() to return true if the work was queued and change stop_one_cpu_nowait() to return the result of cpu_stop_queue_work(). This makes it more useful, for example now you can alloc cpu_stop_work for stop_one_cpu_nowait() and free it in the callback or if stop_one_cpu_nowait() fails, currently this is impossible because you can't know if @fn will be called or not. Also, this allows to kill cpu_stop_done->executed, see the next changes. 
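A sketch of the newly possible pattern (hypothetical caller; my_fn and the allocation are illustrative):

	static int my_fn(void *arg)		/* hypothetical callback */
	{
		/* ... do the actual work ... */
		kfree(arg);			/* runs iff the work was queued */
		return 0;
	}

	struct cpu_stop_work *work = kmalloc(sizeof(*work), GFP_KERNEL);

	if (!work)
		return -ENOMEM;
	if (!stop_one_cpu_nowait(cpu, my_fn, work, work))
		kfree(work);			/* not queued: my_fn() never runs */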
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151117170523.GA13955@redhat.com Signed-off-by: Ingo Molnar --- include/linux/stop_machine.h | 7 +++++-- kernel/stop_machine.c | 16 ++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca..9ef42e1 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -29,7 +29,7 @@ struct cpu_stop_work { int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); @@ -65,7 +65,7 @@ static void stop_one_cpu_nowait_workfn(struct work_struct *work) preempt_enable(); } -static inline void stop_one_cpu_nowait(unsigned int cpu, +static inline bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { @@ -74,7 +74,10 @@ static inline void stop_one_cpu_nowait(unsigned int cpu, work_buf->fn = fn; work_buf->arg = arg; schedule_work(&work_buf->work); + return true; } + + return false; } static inline int stop_cpus(const struct cpumask *cpumask, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 17f01a9..0ec1f16 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -81,17 +81,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper, } /* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) +static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; + bool enabled; spin_lock_irqsave(&stopper->lock, flags); - if (stopper->enabled) + enabled = stopper->enabled; + if (enabled) __cpu_stop_queue_work(stopper, work); else cpu_stop_signal_done(work->done, false); spin_unlock_irqrestore(&stopper->lock, flags); + + return enabled; } /** @@ -297,12 +301,16 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * * CONTEXT: * Don't care. + * + * RETURNS: + * true if cpu_stop_work was queued successfully and @fn will be called, + * false otherwise. */ -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(cpu, work_buf); + return cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ -- cgit v1.1 From 958c5f848e17e216df138cc2161b07b7120e2d15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:20 +0100 Subject: stop_machine: Change stop_one_cpu() to rely on cpu_stop_queue_work() Change stop_one_cpu() to return -ENOENT if cpu_stop_queue_work() fails. Otherwise we know that ->executed must be true after wait_for_completion() so we can just return done.ret. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193320.GA8259@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0ec1f16..68b73c4 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -128,9 +128,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(cpu, &work); + if (!cpu_stop_queue_work(cpu, &work)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + WARN_ON(!done.executed); + return done.ret; } /* This controls the threads on each CPU. */ -- cgit v1.1 From 4aff1ca6970afbf9cd916c34a9c442c8ccba905e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:23 +0100 Subject: stop_machine: Change __stop_cpus() to rely on cpu_stop_queue_work() Change queue_stop_cpus_work() to return true if it queues at least one work, this means that the caller should wait. __stop_cpus() can check the value returned by queue_stop_cpus_work() and avoid done.executed, just like stop_one_cpu() does. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193323.GA8262@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 68b73c4..ed2019a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -318,12 +318,13 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static void queue_stop_cpus_work(const struct cpumask *cpumask, +static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; + bool queued = false; /* * Disable preemption while queueing to avoid getting @@ -336,9 +337,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, work->fn = fn; work->arg = arg; work->done = done; - cpu_stop_queue_work(cpu, work); + if (cpu_stop_queue_work(cpu, work)) + queued = true; } lg_global_unlock(&stop_cpus_lock); + + return queued; } static int __stop_cpus(const struct cpumask *cpumask, @@ -347,9 +351,11 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + WARN_ON(!done.executed); + return done.ret; } /** -- cgit v1.1 From 6fa3b826bcb3309157166e6e523a4be236fe267a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:26 +0100 Subject: stop_machine: Kill cpu_stop_done->executed Now that cpu_stop_done->executed becomes write-only (ignoring WARN_ON() checks) we can remove it. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193326.GA8269@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ed2019a..09eb83f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -28,7 +28,6 @@ */ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ - bool executed; /* actually executed? */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; @@ -63,11 +62,9 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) } /* signal completion unless @done is NULL */ -static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +static void cpu_stop_signal_done(struct cpu_stop_done *done) { if (done) { - if (executed) - done->executed = true; if (atomic_dec_and_test(&done->nr_todo)) complete(&done->completion); } @@ -92,7 +89,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) if (enabled) __cpu_stop_queue_work(stopper, work); else - cpu_stop_signal_done(work->done, false); + cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); return enabled; @@ -131,7 +128,6 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; wait_for_completion(&done.completion); - WARN_ON(!done.executed); return done.ret; } @@ -286,7 +282,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * return -ENOENT; wait_for_completion(&done.completion); - WARN_ON(!done.executed); return done.ret; } @@ -354,7 +349,6 @@ static int __stop_cpus(const struct cpumask *cpumask, if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) return -ENOENT; wait_for_completion(&done.completion); - WARN_ON(!done.executed); return done.ret; } @@ -467,6 +461,7 @@ repeat: ret = fn(arg); if (ret && done) done->ret = ret; + cpu_stop_signal_done(done); /* restore preemption and check it's still balanced */ preempt_enable(); @@ -475,7 +470,6 @@ repeat: kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, ksym_buf), arg); - cpu_stop_signal_done(done, true); goto repeat; } } -- cgit v1.1 From dd2e3121e3cb16d03a6e3f2db48f260f046f39c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:29 +0100 Subject: stop_machine: Shift the 'done != NULL' check from cpu_stop_signal_done() to callers Change cpu_stop_queue_work() and cpu_stopper_thread() to check done != NULL before cpu_stop_signal_done(done). This makes the code more clean imo, note that cpu_stopper_thread() has to do this check anyway. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193329.GA8274@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 09eb83f..7ff7ace 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -64,10 +64,8 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) /* signal completion unless @done is NULL */ static void cpu_stop_signal_done(struct cpu_stop_done *done) { - if (done) { - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, @@ -88,7 +86,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) enabled = stopper->enabled; if (enabled) __cpu_stop_queue_work(stopper, work); - else + else if (work->done) cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); @@ -457,12 +455,12 @@ repeat: /* cpu stop callbacks are not allowed to sleep */ preempt_disable(); - ret = fn(arg); - if (ret && done) - done->ret = ret; - cpu_stop_signal_done(done); - + if (done) { + if (ret) + done->ret = ret; + cpu_stop_signal_done(done); + } /* restore preemption and check it's still balanced */ preempt_enable(); WARN_ONCE(preempt_count(), -- cgit v1.1 From accaf6ea3db6f5fb997f096b6eefd5431d03f7e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 15 Nov 2015 20:33:32 +0100 Subject: stop_machine: Clean up the usage of the preemption counter in cpu_stopper_thread() 1. Change this code to use preempt_count_inc/preempt_count_dec; this way it works even if CONFIG_PREEMPT_COUNT=n, and we avoid the unnecessary __preempt_schedule() check (stop_sched_class is not preemptible). And this makes clear that we only want to make preempt_count() != 0 for __might_sleep() / schedule_debug(). 2. Change WARN_ONCE() to use %pf to print the function name and remove kallsyms_lookup/ksym_buf. 3. Move "int ret" into the "if (work)" block, this looks more consistent. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Mike Galbraith Cc: Milos Vyletel Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151115193332.GA8281@redhat.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 7ff7ace..6110119 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -435,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; - int ret; repeat: work = NULL; @@ -451,23 +450,19 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN] __maybe_unused; + int ret; - /* cpu stop callbacks are not allowed to sleep */ - preempt_disable(); + /* cpu stop callbacks must not sleep, make in_atomic() == T */ + preempt_count_inc(); ret = fn(arg); if (done) { if (ret) done->ret = ret; cpu_stop_signal_done(done); } - /* restore preemption and check it's still balanced */ - preempt_enable(); + preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %s(%p) leaked preempt count\n", - kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, - ksym_buf), arg); - + "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); goto repeat; } } -- cgit v1.1 From 525628c73bd6af65f27d927e699e7460d7d55ed3 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 18 Nov 2015 09:34:59 +0900 Subject: sched/fair: Modify the comment about lock assumptions in migrate_task_rq_fair() The comment describing migrate_task_rq_fair() says that the caller should hold p->pi_lock. But in some cases the caller can hold task_rq(p)->lock instead of p->pi_lock. So the comment is broken and this patch fixes it. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447806899-20303-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7017ad..ff8ec86 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5058,8 +5058,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { -- cgit v1.1 From 10013ebb5d7856c243541870f4e62fed68253e88 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 22 Oct 2015 15:07:20 -0700 Subject: x86: Add an inlined __copy_from_user_nmi() variant Add an inlined __ variant of copy_from_user_nmi. The inlined variant allows the user to: - batch the access_ok() check for multiple accesses - avoid having a pagefault_disable/enable() on every access if the caller already ensures disabled page faults due to its context. - get all the optimizations in copy_*_user() for small, constant-sized transfers It is just a define to __copy_from_user_inatomic().
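A sketch of the intended batched use (illustration only; the stack-walk patch that follows does exactly this): one access_ok() check and one pagefault_disable() section cover several small constant-sized copies, each of which the compiler can turn into a single load:

	if (!access_ok(VERIFY_READ, fp, 16))
		return 0;

	pagefault_disable();
	if (__copy_from_user_nmi(&frame.next_frame, fp, 8) ||
	    __copy_from_user_nmi(&frame.return_address, fp + 8, 8))
		goto out;
	/* ... consume frame ... */
out:
	pagefault_enable();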
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445551641-13379-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 09b1b0a..660458a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -745,5 +745,14 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #undef __copy_from_user_overflow #undef __copy_to_user_overflow +/* + * We rely on the nested NMI work to allow atomic faults from the NMI path; the + * nested NMI paths are careful to preserve CR2. + * + * Caller must use pagefault_enable/disable, or run in interrupt context, + * and also do an access_ok() check + */ +#define __copy_from_user_nmi __copy_from_user_inatomic + #endif /* _ASM_X86_UACCESS_H */ -- cgit v1.1 From 75925e1ad7f5a4e867bd14ff8e7f114ea1596434 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 22 Oct 2015 15:07:21 -0700 Subject: perf/x86: Optimize stack walk user accesses Change the perf user stack walking to use the new __copy_from_user_nmi(), and split each access into word-sized transfers. This allows us to inline the complete access and optimize it all into a single load. The main advantage is that this avoids the overhead of double page faults. When normal copy_from_user() fails it reexecutes the copy to compute an accurate number of non-copied bytes. This leads to executing the expensive page fault twice. While walking stacks, having a fault at some point is relatively common (typically when some part of the program isn't compiled with frame pointers), so this is a large overhead. With the optimized copies we avoid this problem because they only do all accesses once. And of course they're much faster too when the access does not fault because they're just single instructions instead of complex function calls. While profiling a kernel build with -g, the patch brings down the average time of the PMI handler from 966ns to 552ns (-43%).
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1445551641-13379-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2bf79d7..9dfbba5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2250,12 +2250,19 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); + pagefault_disable(); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (!access_ok(VERIFY_READ, fp, 8)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); if (bytes != 0) break; @@ -2265,6 +2272,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } + pagefault_enable(); return 1; } #else @@ -2302,12 +2310,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (perf_callchain_user32(regs, entry)) return; + pagefault_disable(); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (!access_ok(VERIFY_READ, fp, 16)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); if (bytes != 0) break; @@ -2315,8 +2330,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) break; perf_callchain_store(entry, frame.return_address); - fp = frame.next_frame; + fp = (void __user *)frame.next_frame; } + pagefault_enable(); } /* -- cgit v1.1 From b16a5b52eb90d92b597257778e51e1fdc6423e64 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Oct 2015 11:46:34 -0700 Subject: perf/x86: Add option to disable reading branch flags/cycles With LBRv5 reading the extra LBR flags like mispredict, TSX, cycles is not free anymore, as it has moved to a separate MSR. For callstack mode we don't need any of this information; so we can avoid the unnecessary MSR read. Add flags to the perf interface where perf record can request not collecting this information. Add branch_sample_type flags for CYCLES and FLAGS. It's a bit unusual for branch_sample_types to be negative (disable), not positive (enable), but since the legacy ABI reported the flags we need some form of explicit disabling to avoid breaking the ABI. After we have the flags the x86 perf code can keep track if any users need the flags. If noone needs it the information is not collected. This cuts down the cost of LBR callstack on Skylake significantly. Profiling a kernel build with LBR call stack the average run time of the PMI handler drops by 43%. 
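For illustration, how user space might ask for the cheaper mode (hypothetical sketch; flag values as defined in the uapi hunk below):

	struct perf_event_attr attr = {
		.type			= PERF_TYPE_HARDWARE,
		.config			= PERF_COUNT_HW_CPU_CYCLES,
		.sample_type		= PERF_SAMPLE_BRANCH_STACK,
		.branch_sample_type	= PERF_SAMPLE_BRANCH_USER |
					  PERF_SAMPLE_BRANCH_CALL_STACK |
					  PERF_SAMPLE_BRANCH_NO_FLAGS |
					  PERF_SAMPLE_BRANCH_NO_CYCLES,
	};

With both NO_ bits set on an LBR_FORMAT_INFO machine, the kernel sets the internal LBR_NO_INFO bit and skips the per-entry MSR_LBR_INFO_* reads.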
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1445366797-30894-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 19 +++++++++++++++++-- include/uapi/linux/perf_event.h | 6 ++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 659f01e..e2fad0c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -42,6 +42,13 @@ static enum { #define LBR_FAR_BIT 8 /* do not capture far branches */ #define LBR_CALL_STACK_BIT 9 /* enable call stack */ +/* + * Following bit only exists in Linux; we mask it out before writing it to + * the actual MSR. But it helps the constraint perf code to understand + * that this is a separate configuration. + */ +#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */ + #define LBR_KERNEL (1 << LBR_KERNEL_BIT) #define LBR_USER (1 << LBR_USER_BIT) #define LBR_JCC (1 << LBR_JCC_BIT) @@ -52,6 +59,7 @@ static enum { #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) #define LBR_FAR (1 << LBR_FAR_BIT) #define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) +#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) #define LBR_PLM (LBR_KERNEL | LBR_USER) @@ -152,7 +160,7 @@ static void __intel_pmu_lbr_enable(bool pmi) * did not change. */ if (cpuc->lbr_sel) - lbr_select = cpuc->lbr_sel->config; + lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; if (!pmi) wrmsrl(MSR_LBR_SELECT, lbr_select); @@ -422,6 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) */ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { + bool need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); @@ -442,7 +451,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - if (lbr_format == LBR_FORMAT_INFO) { + if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); @@ -590,6 +599,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if (v != LBR_IGN) mask |= v; } + reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; @@ -600,6 +610,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) */ reg->config = mask ^ x86_pmu.lbr_sel_mask; + if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && + (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && + (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + reg->config |= LBR_NO_INFO; + return 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d801bb0..1afe962 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -171,6 +171,9 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ + PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ + PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -192,6 +195,9 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, 
PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; -- cgit v1.1 From 24cc12b17679f8e9046746f92fd377f589efc163 Mon Sep 17 00:00:00 2001 From: Takao Indoh Date: Wed, 4 Nov 2015 14:22:32 +0900 Subject: perf/x86/intel/pt: Add interface to stop Intel PT logging This patch adds a function for external components to stop Intel PT. Basically this function is used when kernel panic occurs. When it is called, the intel_pt driver disables Intel PT and saves its registers using pt_event_stop(), which is also used by the pmu.stop handler. This function stops Intel PT on the CPU where it is working; therefore users of it need to call it for each CPU to stop all logging. Signed-off-by: Takao Indoh Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H.Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vivek Goyal Link: http://lkml.kernel.org/r/1446614553-6072-2-git-send-email-indou.takao@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel_pt.h | 10 ++++++++++ arch/x86/kernel/cpu/perf_event_intel_pt.c | 9 +++++++++ 2 files changed, 19 insertions(+) create mode 100644 arch/x86/include/asm/intel_pt.h diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h new file mode 100644 index 0000000..e1a4117 --- /dev/null +++ b/arch/x86/include/asm/intel_pt.h @@ -0,0 +1,10 @@ +#ifndef _ASM_X86_INTEL_PT_H +#define _ASM_X86_INTEL_PT_H + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) +void cpu_emergency_stop_pt(void); +#else +static inline void cpu_emergency_stop_pt(void) {} +#endif + +#endif /* _ASM_X86_INTEL_PT_H */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 868e119..c0bbd10 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "perf_event.h" #include "intel_pt.h" @@ -1122,6 +1123,14 @@ static int pt_event_init(struct perf_event *event) return 0; } +void cpu_emergency_stop_pt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + if (pt->handle.event) + pt_event_stop(pt->handle.event, PERF_EF_UPDATE); +} + static __init int pt_init(void) { int ret, cpu, prior_warn = 0; -- cgit v1.1 From da06a43d3f3f3df87416f654fe15d29fecb5e321 Mon Sep 17 00:00:00 2001 From: Takao Indoh Date: Wed, 4 Nov 2015 14:22:33 +0900 Subject: perf, x86: Stop Intel PT before kdump starts This patch stops Intel PT logging and saves its registers in memory before kdump is started. This feature is needed to prevent Intel PT from overwriting its log buffer after panic, and saved registers are needed to find the last position where Intel PT wrote data. After the crash dump is captured by kdump, users can retrieve the log buffer from the vmcore and use it to investigate bad kernel behavior.
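Since cpu_emergency_stop_pt() acts on the local CPU only, every user must reach each CPU. Outside the panic path that could look like this hypothetical sketch; the kdump patch below instead calls it from each CPU's NMI shutdown callback:

	static void stop_pt_local(void *unused)	/* hypothetical helper */
	{
		cpu_emergency_stop_pt();
	}

	on_each_cpu(stop_pt_local, NULL, 1);	/* run on every online CPU */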
Signed-off-by: Takao Indoh Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H.Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vivek Goyal Link: http://lkml.kernel.org/r/1446614553-6072-3-git-send-email-indou.takao@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2c1910f..58f3431 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -35,6 +35,7 @@ #include #include #include +#include /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 @@ -125,6 +126,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) cpu_emergency_vmxoff(); cpu_emergency_svm_disable(); + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + disable_local_APIC(); } @@ -169,6 +175,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs) cpu_emergency_vmxoff(); cpu_emergency_svm_disable(); + /* + * Disable Intel PT to stop its logging + */ + cpu_emergency_stop_pt(); + #ifdef CONFIG_X86_IO_APIC /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ ioapic_zap_locks(); -- cgit v1.1 From b7883a1c4f75edb62fc49da6000c59fb881e3c7b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 Nov 2015 16:21:07 -0800 Subject: perf/x86: Handle multiple umask bits for BDW CYCLE_ACTIVITY.* The earlier constraint fix for Broadwell CYCLE_ACTIVITY.* forced umask 8 to counter 2. For this it used UEVENT, to match the complete umask. The event list for Broadwell has an additional STALLS_L1D_PENDING event that uses umask 8, but also sets other bits in the umask. The earlier strict umask match didn't handle this case. Add a new UBIT_EVENT constraint macro that only matches the specified bits in the umask. Then use that macro to handle CYCLE_ACTIVITY.* on Broadwell. The documented event also uses cmask, but there's no need to let the event scheduler know about the cmask, as the scheduling restriction is only tied to the umask. Reported-by: Grant Ayers Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1447719667-9998-1-git-send-email-andi@firstfloor.org [ Filled in the missing email address of Grant Ayers - hopefully I got the right one.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 4 ++++ arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ab18b8a..58402f6 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -318,6 +318,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +/* Constraint on specific umask bit only + event */ +#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) + /* Like UEVENT_CONSTRAINT, but match flags too */ #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f63360b..61f2577 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -255,7 +255,7 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ + INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ EVENT_CONSTRAINT_END }; -- cgit v1.1 From 978e5a3692c3b674b4c7c412e96835fd996c2ff4 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 4 Nov 2015 18:52:45 +0800 Subject: atomics: Add test for atomic operations with _relaxed variants Some atomic operations now have _relaxed/acquire/release variants, this patch adds some trivial tests for two purposes: 1. test the behavior of these new operations in a single-CPU environment. 2. get their code generated before we actually use them somewhere, so that we can examine their assembly code. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Link: http://lkml.kernel.org/r/1446634365-25176-1-git-send-email-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- lib/atomic64_test.c | 120 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 83c33a5b..18e422b 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -27,6 +27,65 @@ do { \ (unsigned long long)r); \ } while (0) +/* + * Test for an atomic operation family, + * @test should be a macro accepting parameters (bit, op, ...) + */ + +#define FAMILY_TEST(test, bit, op, args...) \ +do { \ + test(bit, op, ##args); \ + test(bit, op##_acquire, ##args); \ + test(bit, op##_release, ##args); \ + test(bit, op##_relaxed, ##args); \ +} while (0) + +#define TEST_RETURN(bit, op, c_op, val) \ +do { \ + atomic##bit##_set(&v, v0); \ + r = v0; \ + r c_op val; \ + BUG_ON(atomic##bit##_##op(val, &v) != r); \ + BUG_ON(atomic##bit##_read(&v) != r); \ +} while (0) + +#define RETURN_FAMILY_TEST(bit, op, c_op, val) \ +do { \ + FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \ +} while (0) + +#define TEST_ARGS(bit, op, init, ret, expect, args...)
\ +do { \ + atomic##bit##_set(&v, init); \ + BUG_ON(atomic##bit##_##op(&v, ##args) != ret); \ + BUG_ON(atomic##bit##_read(&v) != expect); \ +} while (0) + +#define XCHG_FAMILY_TEST(bit, init, new) \ +do { \ + FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \ +} while (0) + +#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) \ +do { \ + FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \ + init, init, new, init, new); \ + FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \ + init, init, init, wrong, new); \ +} while (0) + +#define INC_RETURN_FAMILY_TEST(bit, i) \ +do { \ + FAMILY_TEST(TEST_ARGS, bit, inc_return, \ + i, (i) + one, (i) + one); \ +} while (0) + +#define DEC_RETURN_FAMILY_TEST(bit, i) \ +do { \ + FAMILY_TEST(TEST_ARGS, bit, dec_return, \ + i, (i) - one, (i) - one); \ +} while (0) + static __init void test_atomic(void) { int v0 = 0xaaa31337; @@ -45,6 +104,18 @@ static __init void test_atomic(void) TEST(, and, &=, v1); TEST(, xor, ^=, v1); TEST(, andnot, &= ~, v1); + + RETURN_FAMILY_TEST(, add_return, +=, onestwos); + RETURN_FAMILY_TEST(, add_return, +=, -one); + RETURN_FAMILY_TEST(, sub_return, -=, onestwos); + RETURN_FAMILY_TEST(, sub_return, -=, -one); + + INC_RETURN_FAMILY_TEST(, v0); + DEC_RETURN_FAMILY_TEST(, v0); + + XCHG_FAMILY_TEST(, v0, v1); + CMPXCHG_FAMILY_TEST(, v0, v1, onestwos); + } #define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) @@ -74,25 +145,10 @@ static __init void test_atomic64(void) TEST(64, xor, ^=, v1); TEST(64, andnot, &= ~, v1); - INIT(v0); - r += onestwos; - BUG_ON(atomic64_add_return(onestwos, &v) != r); - BUG_ON(v.counter != r); - - INIT(v0); - r += -one; - BUG_ON(atomic64_add_return(-one, &v) != r); - BUG_ON(v.counter != r); - - INIT(v0); - r -= onestwos; - BUG_ON(atomic64_sub_return(onestwos, &v) != r); - BUG_ON(v.counter != r); - - INIT(v0); - r -= -one; - BUG_ON(atomic64_sub_return(-one, &v) != r); - BUG_ON(v.counter != r); + RETURN_FAMILY_TEST(64, add_return, +=, onestwos); + RETURN_FAMILY_TEST(64, add_return, +=, -one); + RETURN_FAMILY_TEST(64, sub_return, -=, onestwos); + RETURN_FAMILY_TEST(64, sub_return, -=, -one); INIT(v0); atomic64_inc(&v); @@ -100,33 +156,15 @@ static __init void test_atomic64(void) BUG_ON(v.counter != r); INIT(v0); - r += one; - BUG_ON(atomic64_inc_return(&v) != r); - BUG_ON(v.counter != r); - - INIT(v0); atomic64_dec(&v); r -= one; BUG_ON(v.counter != r); - INIT(v0); - r -= one; - BUG_ON(atomic64_dec_return(&v) != r); - BUG_ON(v.counter != r); - - INIT(v0); - BUG_ON(atomic64_xchg(&v, v1) != v0); - r = v1; - BUG_ON(v.counter != r); - - INIT(v0); - BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0); - r = v1; - BUG_ON(v.counter != r); + INC_RETURN_FAMILY_TEST(64, v0); + DEC_RETURN_FAMILY_TEST(64, v0); - INIT(v0); - BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0); - BUG_ON(v.counter != r); + XCHG_FAMILY_TEST(64, v0, v1); + CMPXCHG_FAMILY_TEST(64, v0, v1, v2); INIT(v0); BUG_ON(atomic64_add_unless(&v, one, v0)); -- cgit v1.1 From 64d816cba06c67eeee455b8c78ebcda349d49c24 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:21 -0500 Subject: locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg() This patch replaces the cmpxchg() and xchg() calls in the native qspinlock code with the more relaxed _acquire or _release versions of those calls to enable other architectures to adopt queued spinlocks with less memory barrier performance overhead. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- include/asm-generic/qspinlock.h | 9 +++++---- kernel/locking/qspinlock.c | 29 ++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index e2aadbc..39e1cb2 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -12,8 +12,9 @@ * GNU General Public License for more details. * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long + * Authors: Waiman Long */ #ifndef __ASM_GENERIC_QSPINLOCK_H #define __ASM_GENERIC_QSPINLOCK_H @@ -62,7 +63,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock) static __always_inline int queued_spin_trylock(struct qspinlock *lock) { if (!atomic_read(&lock->val) && - (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0)) + (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0)) return 1; return 0; } @@ -77,7 +78,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) { u32 val; - val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); + val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL); if (likely(val == 0)) return; queued_spin_lock_slowpath(lock, val); @@ -93,7 +94,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock) /* * smp_mb__before_atomic() in order to guarantee release semantics */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_sub(_Q_LOCKED_VAL, &lock->val); } #endif diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a..7868418 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -14,8 +14,9 @@ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long + * Authors: Waiman Long * Peter Zijlstra */ @@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { struct __qspinlock *l = (void *)lock; - return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; + /* + * Use release semantics to make sure that the MCS node is properly + * initialized before changing the tail code. + */ + return (u32)xchg_release(&l->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ @@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Use release semantics to make sure that the MCS node is + * properly initialized before changing the tail code. + */ + old = atomic_cmpxchg_release(&lock->val, val, new); if (old == val) break; @@ -319,7 +329,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (val == new) new |= _Q_PENDING_VAL; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Acquire semantic is required here as the function may + * return immediately if the lock was free. 
+ */ + old = atomic_cmpxchg_acquire(&lock->val, val, new); if (old == val) break; @@ -426,7 +440,12 @@ queue: set_locked(lock); break; } - old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + /* + * The smp_load_acquire() call above has provided the necessary + * acquire semantics required for locking. At most two + * iterations of this loop may run. + */ + old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) goto release; /* No contention */ -- cgit v1.1 From 81b5598665a24083dd889fbd8cb08b0d8de4b8ad Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:22 -0500 Subject: locking/qspinlock: Prefetch the next node cacheline A queue head CPU, after acquiring the lock, will have to notify the next CPU in the wait queue that it has become the new queue head. This involves loading a new cacheline from the MCS node of the next CPU. That operation can be expensive and add to the latency of the locking operation. This patch adds code to optimistically prefetch the next MCS node cacheline if the next pointer is defined and the CPU has been spinning on the MCS lock for a while. This reduces the locking latency and improves the system throughput. The performance change will depend on whether the prefetch overhead can be hidden within the latency of the lock spin loop. On really short critical sections, there may not be any performance gain at all. With longer critical sections, however, it was found to have a performance boost of 5-10% over a range of different queue depths with a spinlock loop microbenchmark. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7868418..365b203 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -407,6 +407,16 @@ queue: pv_wait_node(node); arch_mcs_spin_lock_contended(&node->locked); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); } /* -- cgit v1.1 From aa68744f80bfb6f26fbe7f10e42876066f7dac1b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:23 -0500 Subject: locking/qspinlock: Avoid redundant read of next pointer With optimistic prefetch of the next node cacheline, the next pointer may have been properly initialized. As a result, the reading of node->next in the contended path may be redundant. This patch eliminates the redundant read if the next pointer value is not NULL. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-4-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 365b203..9862078 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -396,6 +396,7 @@ queue: * p,*,* -> n,*,* */ old = xchg_tail(lock, tail); + next = NULL; /* * if there was a previous node; link it and wait until reaching the @@ -463,10 +464,12 @@ queue: } /* - * contended path; wait for next, release. + * contended path; wait for next if not observed yet, release. */ - while (!(next = READ_ONCE(node->next))) - cpu_relax(); + if (!next) { + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + } arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); -- cgit v1.1 From d78045306c41bd9334b956e4e7fa77cc72f06a40 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:24 -0500 Subject: locking/pvqspinlock, x86: Optimize the PV unlock code path The unlock function in queued spinlocks was optimized for better performance on bare metal systems at the expense of virtualized guests. For x86-64 systems, the unlock call needs to go through a PV_CALLEE_SAVE_REGS_THUNK() which saves and restores 8 64-bit registers before calling the real __pv_queued_spin_unlock() function. The thunk code may also be in a separate cacheline from __pv_queued_spin_unlock(). This patch optimizes the PV unlock code path by: 1) Moving the unlock slowpath code from the fastpath into a separate __pv_queued_spin_unlock_slowpath() function to make the fastpath as simple as possible. 2) For x86-64, hand-coding an assembly function to combine the register saving thunk code with the fastpath code. Only registers that are used in the fastpath will be saved and restored. If the fastpath fails, the slowpath function will be called via another PV_CALLEE_SAVE_REGS_THUNK(). For 32-bit, it falls back to the C __pv_queued_spin_unlock() code as the thunk saves and restores only one 32-bit register. With a microbenchmark of a 5M lock-unlock loop, the table below shows the execution times before and after the patch with different numbers of threads in a VM running on a 32-core Westmere-EX box with x86-64 4.2-rc1 based kernels:

Threads  Before patch  After patch  % Change
-------  ------------  -----------  --------
   1        134.1 ms     119.3 ms     -11%
   2         1286 ms       953 ms     -26%
   3         3715 ms      3480 ms    -6.3%
   4         4092 ms      3764 ms    -8.0%

Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-5-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/qspinlock_paravirt.h | 59 +++++++++++++++++++++++++++++++ kernel/locking/qspinlock_paravirt.h | 43 +++++++++++++--------- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index b002e71..9f92c18 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -1,6 +1,65 @@ #ifndef __ASM_QSPINLOCK_PARAVIRT_H #define __ASM_QSPINLOCK_PARAVIRT_H +/* + * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit + * registers.
For i386, however, only 1 32-bit register needs to be saved + * and restored. So an optimized version of __pv_queued_spin_unlock() is + * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit. + */ +#ifdef CONFIG_64BIT + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); +#define __pv_queued_spin_unlock __pv_queued_spin_unlock +#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" +#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" + +/* + * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock + * which combines the register-saving thunk and the body of the following + * C code: + * + * void __pv_queued_spin_unlock(struct qspinlock *lock) + * { + * struct __qspinlock *l = (void *)lock; + * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + * + * if (likely(lockval == _Q_LOCKED_VAL)) + * return; + * pv_queued_spin_unlock_slowpath(lock, lockval); + * } + * + * For x86-64, + * rdi = lock (first argument) + * rsi = lockval (second argument) + * rdx = internal variable (set to 0) + */ +asm (".pushsection .text;" + ".globl " PV_UNLOCK ";" + ".align 4,0x90;" + PV_UNLOCK ": " + "push %rdx;" + "mov $0x1,%eax;" + "xor %edx,%edx;" + "lock cmpxchg %dl,(%rdi);" + "cmp $0x1,%al;" + "jne .slowpath;" + "pop %rdx;" + "ret;" + ".slowpath: " + "push %rsi;" + "movzbl %al,%esi;" + "call " PV_UNLOCK_SLOWPATH ";" + "pop %rsi;" + "pop %rdx;" + "ret;" + ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" + ".popsection"); + +#else /* CONFIG_64BIT */ + +extern void __pv_queued_spin_unlock(struct qspinlock *lock); PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); +#endif /* CONFIG_64BIT */ #endif diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff..4bd323d 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -308,23 +308,14 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock(). */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 locked; - - /* - * We must not unlock if SLOW, because in that case we must first - * unhash. Otherwise it would be possible to have multiple @lock - * entries, which would be BAD. - */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) - return; if (unlikely(locked != _Q_SLOW_VAL)) { WARN(!debug_locks_silent, @@ -363,12 +354,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) */ pv_kick(node->cpu); } + /* * Include the architecture specific callee-save thunk of the * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, an architecture-specific version of __pv_queued_spin_unlock() + * can be defined.
*/ #include +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + __pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ -- cgit v1.1 From 5fdf5d37f40a3b18c0d613463867f71c017b75ef Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 19 Nov 2015 16:55:45 -0500 Subject: x86/xen: Avoid fast syscall path for Xen PV guests After 32-bit syscall rewrite, and specifically after commit: 5f310f739b4c ("x86/entry/32: Re-implement SYSENTER using the new C path") ... the stack frame that is passed to xen_sysexit is no longer a "standard" one (i.e. it's not pt_regs). Since we end up calling xen_iret from xen_sysexit we don't need to fix up the stack and instead follow entry_SYSENTER_32's IRET path directly to xen_iret. We can do the same thing for compat mode even though stack does not need to be fixed. This will allow us to drop usergs_sysret32 paravirt op (in the subsequent patch) Suggested-by: Andy Lutomirski Signed-off-by: Boris Ostrovsky Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1447970147-1733-2-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/entry/entry_64_compat.S | 10 ++++++---- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/xen/enlighten.c | 4 +++- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3eb572e..0870825 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -308,8 +308,9 @@ sysenter_past_esp: movl %esp, %eax call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. 
*/ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index c320183..402e34a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -121,8 +121,9 @@ sysenter_flags_fixed: movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV jmp sysret32_from_system_call sysenter_fix_flags: @@ -200,8 +201,9 @@ ENTRY(entry_SYSCALL_compat) movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e4f8010..f7ba9fb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -216,6 +216,7 @@ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5774800..d315151 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1886,8 +1886,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) + if (xen_pv_domain()) { clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + set_cpu_cap(c, X86_FEATURE_XENPV); + } } const struct hypervisor_x86 x86_hyper_xen = { -- cgit v1.1 From 88c15ec90ff16880efab92b519436ee17b198477 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 19 Nov 2015 16:55:46 -0500 Subject: x86/paravirt: Remove the unused irq_enable_sysexit pv op As result of commit "x86/xen: Avoid fast syscall path for Xen PV guests", the irq_enable_sysexit pv op is not called by Xen PV guests anymore and since they were the only ones who used it we can safely remove it. Signed-off-by: Boris Ostrovsky Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1447970147-1733-3-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 8 ++------ arch/x86/include/asm/paravirt.h | 7 ------- arch/x86/include/asm/paravirt_types.h | 9 --------- arch/x86/kernel/asm-offsets.c | 3 --- arch/x86/kernel/paravirt.c | 7 ------- arch/x86/kernel/paravirt_patch_32.c | 2 -- arch/x86/kernel/paravirt_patch_64.c | 1 - arch/x86/xen/enlighten.c | 3 --- arch/x86/xen/xen-asm_32.S | 14 -------------- arch/x86/xen/xen-ops.h | 3 --- 10 files changed, 2 insertions(+), 55 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0870825..9870c97 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -329,7 +329,8 @@ sysenter_past_esp: * Return back to the vDSO, which will pop ecx and edx. 
* Don't bother with DS and ES (they already contain __USER_DS). */ - ENABLE_INTERRUPTS_SYSEXIT + sti + sysexit .pushsection .fixup, "ax" 2: movl $0, PT_FS(%esp) @@ -552,11 +553,6 @@ ENTRY(native_iret) iret _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) #endif ENTRY(overflow) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596..c28518e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -932,13 +932,6 @@ extern void default_banner(void); push %ecx; push %edx; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx - -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) - - #else /* !CONFIG_X86_32 */ /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5..608bbf3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -157,15 +157,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); -#ifdef CONFIG_X86_32 - /* - * Atomically enable interrupts and return to userspace. This - * is only used in 32-bit kernels. 64-bit kernels use - * usergs_sysret32 instead. - */ - void (*irq_enable_sysexit)(void); -#endif - /* * Switch to usermode gs and return to 64-bit usermode using * sysret. Only used in 64-bit kernels to return to 64-bit diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 439df97..84a7524 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -65,9 +65,6 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); -#ifdef CONFIG_X86_32 - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); -#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c2130ae..c55f437 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -162,9 +162,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || -#ifdef CONFIG_X86_32 - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || -#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -220,7 +217,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); @@ -379,9 +375,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) - .irq_enable_sysexit = native_irq_enable_sysexit, -#endif #ifdef CONFIG_X86_64 #ifdef CONFIG_IA32_EMULATION .usergs_sysret32 = native_usergs_sysret32, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index c89f50a..158dc06 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, 
"pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); @@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 8aa0558..17c00f8 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -13,7 +13,6 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d315151..a068e36 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1229,10 +1229,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .iret = xen_iret, #ifdef CONFIG_X86_64 - .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, -#else - .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index fd92a64..feb6d40 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -35,20 +35,6 @@ check_events: ret /* - * We can't use sysexit directly, because we're not running in ring0. - * But we can easily fake it up using iret. Assuming xen_sysexit is - * jumped to with a standard stack frame, we can just strip it back to - * a standard iret frame and use iret. - */ -ENTRY(xen_sysexit) - movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ - orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) - lea PT_EIP(%esp), %esp - - jmp xen_iret -ENDPROC(xen_sysexit) - -/* * This is run where a normal iret would be run, with the same stack setup: * 8: eflags * 4: cs diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1399423..4140b07 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -#ifdef CONFIG_X86_32 -__visible void xen_sysexit(void); -#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); -- cgit v1.1 From 75ef82190dceac3d84cdc209fdf82800a7cc6609 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 19 Nov 2015 16:55:47 -0500 Subject: x86/entry, x86/paravirt: Remove the unused usergs_sysret32 PV op As result of commit "x86/xen: Avoid fast syscall path for Xen PV guests", usergs_sysret32 pv op is not called by Xen PV guests anymore and since they were the only ones who used it we can safely remove it. Signed-off-by: Boris Ostrovsky Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1447970147-1733-4-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 10 ++-------- arch/x86/include/asm/paravirt.h | 5 ----- arch/x86/include/asm/paravirt_types.h | 8 -------- arch/x86/kernel/asm-offsets_64.c | 1 - arch/x86/kernel/paravirt.c | 5 ----- arch/x86/kernel/paravirt_patch_64.c | 2 -- arch/x86/xen/xen-asm_64.S | 19 ------------------- 7 files changed, 2 insertions(+), 48 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 402e34a..bbcb285 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -18,13 +18,6 @@ .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - /* * 32-bit SYSENTER instruction entry. * @@ -238,7 +231,8 @@ sysret32_from_system_call: xorq %r9, %r9 xorq %r10, %r10 movq RSP-ORIG_RAX(%rsp), %rsp - USERGS_SYSRET32 + swapgs + sysretl END(entry_SYSCALL_compat) /* diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c28518e..1b71c3a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -922,11 +922,6 @@ extern void default_banner(void); call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -#define USERGS_SYSRET32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32)) - #ifdef CONFIG_X86_32 #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 608bbf3..702c8bd 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -165,14 +165,6 @@ struct pv_cpu_ops { */ void (*usergs_sysret64)(void); - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - /* Normal iret. Jump to this with the standard iret stack frame set up. 
*/ void (*iret)(void); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d8f42f9..f2edafb 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,7 +23,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c55f437..8c19b4d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -162,7 +162,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); @@ -217,7 +216,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { @@ -376,9 +374,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, #ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION - .usergs_sysret32 = native_usergs_sysret32, -#endif .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 17c00f8..e70087a 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -14,7 +14,6 @@ DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); -DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); @@ -54,7 +53,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_mmu_ops, read_cr2); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index f22667a..cc8acc4 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -68,25 +68,6 @@ ENTRY(xen_sysret64) ENDPATCH(xen_sysret64) RELOC(xen_sysret64, 1b+1) -ENTRY(xen_sysret32) - /* - * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - pushq $__USER32_DS - pushq PER_CPU_VAR(rsp_scratch) - pushq %r11 - pushq $__USER32_CS - pushq %rcx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysret32) -RELOC(xen_sysret32, 1b+1) - /* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: -- cgit v1.1 From 8609d1b5daa36350e020e737946c40887af1743a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Nov 2015 17:07:55 -0800 Subject: x86/mm: Turn CONFIG_X86_PTDUMP into a module Being able to examine page tables is handy, so make this a module that can be loaded as needed. 
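(Illustration, not part of the patch: once the module is loaded, the dump is read through debugfs like any other seq_file. The reader below is a hypothetical userspace sketch; the file path comes from the debugfs_create_file() call in this patch, the module name is assumed from the Makefile rule, and root access is needed since the file is created with mode 0600.)

#include <stdio.h>

int main(void)
{
	/* File created by pt_dump_debug_init(); requires debugfs mounted
	 * at /sys/kernel/debug and the debug_pagetables module loaded. */
	FILE *f = fopen("/sys/kernel/debug/kernel_page_tables", "r");
	char line[256];

	if (!f)
		return 1;	/* module not loaded or no permission */

	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}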
Signed-off-by: Kees Cook Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vladimir Murzin Cc: Will Deacon Link: http://lkml.kernel.org/r/20151120010755.GA9060@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- arch/x86/mm/Makefile | 1 + arch/x86/mm/debug_pagetables.c | 46 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/dump_pagetables.c | 34 ++----------------------------- 4 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 arch/x86/mm/debug_pagetables.c diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa9..110253c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -69,7 +69,7 @@ config X86_PTDUMP_CORE def_bool n config X86_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" + tristate "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS select X86_PTDUMP_CORE diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 65c47fd..f9d38a4 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c new file mode 100644 index 0000000..b35ee86 --- /dev/null +++ b/arch/x86/mm/debug_pagetables.c @@ -0,0 +1,46 @@ +#include +#include +#include +#include + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk_pgd_level(m, NULL); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .owner = THIS_MODULE, + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *pe; + +static int __init pt_dump_debug_init(void) +{ + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +static void __exit pt_dump_debug_exit(void) +{ + debugfs_remove_recursive(pe); +} + +module_init(pt_dump_debug_init); +module_exit(pt_dump_debug_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven "); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a035c2a..90a1dc0 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -426,38 +426,15 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { ptdump_walk_pgd_level_core(m, pgd, false); } +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { ptdump_walk_pgd_level_core(NULL, NULL, true); } -#ifdef CONFIG_X86_PTDUMP -static int ptdump_show(struct seq_file *m, void *v) +static int __init pt_dump_init(void) { - ptdump_walk_pgd_level(m, NULL); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -static int pt_dump_init(void) -{ -#ifdef 
CONFIG_X86_PTDUMP - struct dentry *pe; -#endif - #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; @@ -468,13 +445,6 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif -#ifdef CONFIG_X86_PTDUMP - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, - &ptdump_fops); - if (!pe) - return -ENOMEM; -#endif - return 0; } -- cgit v1.1 From 1c97be677f72b3c338312aecd36d8fff20322f32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 20 Sep 2015 22:02:17 -0700 Subject: list: Use WRITE_ONCE() when adding to lists and hlists Code that does lockless emptiness testing of non-RCU lists is relying on the list-addition code to write the list head's ->next pointer atomically. This commit therefore adds WRITE_ONCE() to list-addition pointer stores that could affect the head's ->next pointer. Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- include/linux/list.h | 8 ++++---- lib/list_debug.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 993395a..d7e31fe 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -42,7 +42,7 @@ static inline void __list_add(struct list_head *new, next->prev = new; new->next = next; new->prev = prev; - prev->next = new; + WRITE_ONCE(prev->next, new); } #else extern void __list_add(struct list_head *new, @@ -642,7 +642,7 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) n->next = first; if (first) first->pprev = &n->next; - h->first = n; + WRITE_ONCE(h->first, n); n->pprev = &h->first; } @@ -653,14 +653,14 @@ static inline void hlist_add_before(struct hlist_node *n, n->pprev = next->pprev; n->next = next; next->pprev = &n->next; - *(n->pprev) = n; + WRITE_ONCE(*(n->pprev), n); } static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - prev->next = n; + WRITE_ONCE(prev->next, n); n->pprev = &prev->next; if (n->next) diff --git a/lib/list_debug.c b/lib/list_debug.c index c24c2f7..3859bf6 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -37,7 +37,7 @@ void __list_add(struct list_head *new, next->prev = new; new->next = next; new->prev = prev; - prev->next = new; + WRITE_ONCE(prev->next, new); } EXPORT_SYMBOL(__list_add); -- cgit v1.1 From 2a67e741bbbc022e0fadf8c6dbc3a76019ecd0cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Oct 2015 12:24:23 +0200 Subject: rcu: Create transitive rnp->lock acquisition functions Providing RCU's memory-ordering guarantees requires that the rcu_node tree's locking provide transitive memory ordering, which the Linux kernel's spinlocks currently do not provide unless smp_mb__after_unlock_lock() is used. Having a separate smp_mb__after_unlock_lock() after each and every lock acquisition is error-prone, hard to read, and a bit annoying, so this commit provides wrapper functions that pull in the smp_mb__after_unlock_lock() invocations. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 78 ++++++++++++++++-------------------------------- kernel/rcu/tree.h | 39 ++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 18 ++++------- 3 files changed, 71 insertions(+), 64 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b..daf17e2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1534,10 +1534,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. If there really is no grace @@ -1786,11 +1784,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1814,8 +1811,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1847,8 +1843,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1973,8 +1967,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1993,8 +1986,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2011,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. 
*/ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2035,8 +2026,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2284,8 +2274,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2321,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2343,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2569,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2597,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2794,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2865,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. 
*/ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -3005,8 +2988,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3426,8 +3408,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. */ @@ -3447,8 +3428,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3452,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3531,8 +3510,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3527,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3541,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3708,8 +3684,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -4198,8 +4173,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. 
*/ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e23..f32bebb 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -664,3 +664,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c197..fa0e3b9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -301,8 +301,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -457,8 +456,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; WARN_ON_ONCE(1); @@ -989,8 +987,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1173,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1567,8 +1563,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ if (needwake) @@ -2068,8 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) -- cgit v1.1 From 1658d35ead5d8dd76f2b2d6ad0e32c08d123faa2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 20 Sep 2015 17:03:16 -0700 Subject: list: Use READ_ONCE() when testing for empty lists Most of the list-empty-check macros (list_empty(), hlist_empty(), hlist_bl_empty(), and hlist_nulls_empty()) use an unadorned load to check the list header. Given that these macros are sometimes invoked without the protection of a lock, this is not sufficient. This commit therefore adds READ_ONCE() calls to them. This commit does not touch llist_empty() because it already has the needed ACCESS_ONCE(). Reported-by: Dmitry Vyukov Signed-off-by: Paul E. McKenney --- include/linux/list.h | 4 ++-- include/linux/list_bl.h | 2 +- include/linux/list_nulls.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index d7e31fe..06c2d88 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -186,7 +186,7 @@ static inline int list_is_last(const struct list_head *list, */ static inline int list_empty(const struct list_head *head) { - return head->next == head; + return READ_ONCE(head->next) == head; } /** @@ -608,7 +608,7 @@ static inline int hlist_unhashed(const struct hlist_node *h) static inline int hlist_empty(const struct hlist_head *h) { - return !h->first; + return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 8132214..ee7229a 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -70,7 +70,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, static inline int hlist_bl_empty(const struct hlist_bl_head *h) { - return !((unsigned long)h->first & ~LIST_BL_LOCKMASK); + return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 444d2b1..b01fe10 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -57,7 +57,7 @@ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { - return is_a_nulls(h->first); + return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, -- cgit v1.1 From 6cf10081220ae21175a867d446b3167bcbcb937b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Oct 2015 15:36:54 -0700 Subject: rcu: Add transitivity to remaining rcu_node ->lock acquisitions The rule is that all acquisitions of the rcu_node structure's ->lock must provide transitivity: The lock is not acquired that frequently, and sorting out exactly which required it and which did not would be a maintenance nightmare. This commit therefore supplies the needed transitivity to the remaining ->lock acquisitions. Reported-by: Peter Zijlstra Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 24 ++++++++++++------------ kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index daf17e2..81aa1cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1214,7 +1214,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1237,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1256,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1327,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -2897,7 +2897,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3718,7 +3718,7 @@ retry_ipi: mask_ofl_ipi &= ~mask; } else { /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, @@ -3727,8 +3727,8 @@ retry_ipi: if (cpu_online(cpu) && (rnp->expmask & mask)) goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); + raw_spin_lock_irqsave_rcu_node(rnp, + flags); } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; @@ -4110,7 +4110,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4127,7 +4127,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. 
*/ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4154,7 +4154,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4301,7 +4301,7 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa0e3b9..57ba873 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -525,7 +525,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093c..8efaba8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -319,7 +319,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) -- cgit v1.1 From b26b218a1e9c5815cb8964e180b7fba3cd9bd509 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 17 Nov 2015 16:05:37 +0100 Subject: perf callchain: Move initial entry call into get_entries function Move the initial entry call into the get_entries() function so that all entry processing is in one place. It will be useful for the next change, which adds ordering logic (see the editor's sketch below).
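[Editor's note: the payoff of funneling the initial sample IP through get_entries() is that every callchain entry then flows through a single site, which the ordering change later in this series can exploit with one loop. Below is a minimal, self-contained C sketch of that idea; it is not perf code, and the names (emit_entry, emit_all, ORDER_CALLEE/ORDER_CALLER) and the sample addresses are illustrative assumptions only.]

#include <stdio.h>

enum order { ORDER_CALLEE, ORDER_CALLER };

/* Stand-in for perf's per-entry callback. */
static void emit_entry(unsigned long ip)
{
	printf("0x%lx\n", ip);
}

/*
 * Because every entry lands in one array, switching between callee
 * and caller order is a single index calculation in one loop.
 */
static void emit_all(const unsigned long *ips, int nr, enum order order)
{
	int i;

	for (i = 0; i < nr; i++) {
		int j = (order == ORDER_CALLER) ? nr - i - 1 : i;

		emit_entry(ips[j]);
	}
}

int main(void)
{
	/* The sampled IP plus two unwound frames, collected in one place. */
	unsigned long ips[] = { 0x400123, 0x400456, 0x400789 };

	emit_all(ips, 3, ORDER_CALLEE);
	emit_all(ips, 3, ORDER_CALLER);
	return 0;
}

[End editor's note.]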
Signed-off-by: Jiri Olsa Tested-by: Milian Wolff Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1447772739-18471-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libunwind.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index c83832b..0ae8844 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -614,10 +614,22 @@ void unwind__finish_access(struct thread *thread) static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, void *arg, int max_stack) { + u64 val; unw_addr_space_t addr_space; unw_cursor_t c; int ret; + ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP); + if (ret) + return ret; + + ret = entry(val, ui->thread, cb, arg); + if (ret) + return -ENOMEM; + + if (--max_stack == 0) + return 0; + addr_space = thread__priv(ui->thread); if (addr_space == NULL) return -1; @@ -640,24 +652,17 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, struct thread *thread, struct perf_sample *data, int max_stack) { - u64 ip; struct unwind_info ui = { .sample = data, .thread = thread, .machine = thread->mg->machine, }; - int ret; if (!data->user_regs.regs) return -EINVAL; - ret = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP); - if (ret) - return ret; - - ret = entry(ip, thread, cb, arg); - if (ret) - return -ENOMEM; + if (max_stack <= 0) + return -EINVAL; - return --max_stack > 0 ? get_entries(&ui, cb, arg, max_stack) : 0; + return get_entries(&ui, cb, arg, max_stack); } -- cgit v1.1 From cb1dc22dce6e54dbd1eac213c9216e1aa57084da Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 18 Nov 2015 08:52:47 +0100 Subject: perf callchain: Add order support for libunwind DWARF unwinder As reported by Milian, currently for DWARF unwind (both libdw and libunwind) we display callchain in callee order only. Add support for following the configured callchain order in the libunwind DWARF unwinder, so that we get the following output for a report: $ perf record --call-graph dwarf ls ... $ perf report --no-children --stdio 39.26% ls libc-2.21.so [.] __strcoll_l | ---__strcoll_l mpsort_with_tmp mpsort_with_tmp sort_files main __libc_start_main _start 0 $ perf report -g caller --no-children --stdio ... 39.26% ls libc-2.21.so [.]
__strcoll_l | ---0 _start __libc_start_main main sort_files mpsort_with_tmp mpsort_with_tmp __strcoll_l Based-on-patch-by: Milian Wolff Reported-and-Tested-by: Milian Wolff Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Wang Nan Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/20151118075247.GA5416@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libunwind.c | 47 ++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index 0ae8844..3c258a0 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -615,34 +615,47 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, void *arg, int max_stack) { u64 val; + unw_word_t ips[max_stack]; unw_addr_space_t addr_space; unw_cursor_t c; - int ret; + int ret, i = 0; ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP); if (ret) return ret; - ret = entry(val, ui->thread, cb, arg); - if (ret) - return -ENOMEM; + ips[i++] = (unw_word_t) val; - if (--max_stack == 0) - return 0; - - addr_space = thread__priv(ui->thread); - if (addr_space == NULL) - return -1; + /* + * If we need more than one entry, do the DWARF + * unwind itself. + */ + if (max_stack - 1 > 0) { + addr_space = thread__priv(ui->thread); + if (addr_space == NULL) + return -1; + + ret = unw_init_remote(&c, addr_space, ui); + if (ret) + display_error(ret); + + while (!ret && (unw_step(&c) > 0) && i < max_stack) { + unw_get_reg(&c, UNW_REG_IP, &ips[i]); + ++i; + } - ret = unw_init_remote(&c, addr_space, ui); - if (ret) - display_error(ret); + max_stack = i; + } - while (!ret && (unw_step(&c) > 0) && max_stack--) { - unw_word_t ip; + /* + * Display what we got based on the order setup. + */ + for (i = 0; i < max_stack && !ret; i++) { + int j = i; - unw_get_reg(&c, UNW_REG_IP, &ip); - ret = ip ? entry(ip, ui->thread, cb, arg) : 0; + if (callchain_param.order == ORDER_CALLER) + j = max_stack - i - 1; + ret = ips[j] ? entry(ips[j], ui->thread, cb, arg) : 0; } return ret; -- cgit v1.1 From 8dc0564d809e3903834950e2d12f6d1d2fcff708 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 17 Nov 2015 16:05:39 +0100 Subject: perf test: Add callchain order setup for DWARF unwinder test Add callchain order setup for the DWARF unwinder test. The test now runs the unwinder for both callee and caller orders. Signed-off-by: Jiri Olsa Tested-by: Milian Wolff Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1447772739-18471-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/dwarf-unwind.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 01f0b61..b2357e8 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -51,6 +51,12 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) "krava_1", "test__dwarf_unwind" }; + /* + * The funcs[MAX_STACK] array index, based on the + * callchain order setup. + */ + int idx = callchain_param.order == ORDER_CALLER ?
+ MAX_STACK - *cnt - 1 : *cnt; if (*cnt >= MAX_STACK) { pr_debug("failed: crossed the max stack value %d\n", MAX_STACK); @@ -63,8 +69,10 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) return -1; } - pr_debug("got: %s 0x%" PRIx64 "\n", symbol, entry->ip); - return strcmp((const char *) symbol, funcs[(*cnt)++]); + (*cnt)++; + pr_debug("got: %s 0x%" PRIx64 ", expecting %s\n", + symbol, entry->ip, funcs[idx]); + return strcmp((const char *) symbol, funcs[idx]); } __attribute__ ((noinline)) @@ -105,8 +113,16 @@ static int compare(void *p1, void *p2) /* Any possible value should be 'thread' */ struct thread *thread = *(struct thread **)p1; - if (global_unwind_retval == -INT_MAX) + if (global_unwind_retval == -INT_MAX) { + /* Call unwinder twice for both callchain orders. */ + callchain_param.order = ORDER_CALLER; + global_unwind_retval = unwind_thread(thread); + if (!global_unwind_retval) { + callchain_param.order = ORDER_CALLEE; + global_unwind_retval = unwind_thread(thread); + } + } return p1 - p2; } -- cgit v1.1 From 8bd508b001629a5d836987d9a0702a6bfc4fc705 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 19 Nov 2015 14:01:19 +0100 Subject: perf callchain: Add order support for libdw DWARF unwinder As reported by Milian, currently for DWARF unwind (both libdw and libunwind) we display callchain in callee order only. Add support for following the configured callchain order in the libdw DWARF unwinder, so that we get the following output for a report: $ perf record --call-graph dwarf ls ... $ perf report --no-children --stdio 21.12% ls libc-2.21.so [.] __strcoll_l | ---__strcoll_l mpsort_with_tmp mpsort_with_tmp mpsort_with_tmp sort_files main __libc_start_main _start $ perf report --stdio --no-children -g caller 21.12% ls libc-2.21.so [.] __strcoll_l | ---_start __libc_start_main main sort_files mpsort_with_tmp mpsort_with_tmp mpsort_with_tmp __strcoll_l Reported-and-Tested-by: Milian Wolff Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Jan Kratochvil Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151119130119.GA26617@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libdw.c | 53 ++++++++++++++++++++++++++++++------------ tools/perf/util/unwind-libdw.h | 2 ++ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index 2dcfe9a..db8142b 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -11,6 +11,7 @@ #include #include "event.h" #include "perf_regs.h" +#include "callchain.h" static char *debuginfo_path; @@ -52,25 +53,28 @@ static int report_module(u64 ip, struct unwind_info *ui) return __report_module(&al, ip, ui); } +/* + * Store all entries within entries array, + * we will process it after we finish unwind. + */ static int entry(u64 ip, struct unwind_info *ui) { - struct unwind_entry e; + struct unwind_entry *e = &ui->entries[ui->idx++]; struct addr_location al; if (__report_module(&al, ip, ui)) return -1; - e.ip = ip; - e.map = al.map; - e.sym = al.sym; + e->ip = ip; + e->map = al.map; + e->sym = al.sym; pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n", al.sym ? al.sym->name : "''", ip, al.map ?
al.map->map_ip(al.map, ip) : (u64) 0); - - return ui->cb(&e, ui->arg); + return 0; } static pid_t next_thread(Dwfl *dwfl, void *arg, void **thread_argp) @@ -168,7 +172,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, struct perf_sample *data, int max_stack) { - struct unwind_info ui = { + struct unwind_info *ui, ui_buf = { .sample = data, .thread = thread, .machine = thread->mg->machine, @@ -177,35 +181,54 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg, .max_stack = max_stack, }; Dwarf_Word ip; - int err = -EINVAL; + int err = -EINVAL, i; if (!data->user_regs.regs) return -EINVAL; - ui.dwfl = dwfl_begin(&offline_callbacks); - if (!ui.dwfl) + ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack); + if (!ui) + return -ENOMEM; + + *ui = ui_buf; + + ui->dwfl = dwfl_begin(&offline_callbacks); + if (!ui->dwfl) goto out; err = perf_reg_value(&ip, &data->user_regs, PERF_REG_IP); if (err) goto out; - err = report_module(ip, &ui); + err = report_module(ip, ui); if (err) goto out; - if (!dwfl_attach_state(ui.dwfl, EM_NONE, thread->tid, &callbacks, &ui)) + if (!dwfl_attach_state(ui->dwfl, EM_NONE, thread->tid, &callbacks, ui)) goto out; - err = dwfl_getthread_frames(ui.dwfl, thread->tid, frame_callback, &ui); + err = dwfl_getthread_frames(ui->dwfl, thread->tid, frame_callback, ui); - if (err && !ui.max_stack) + if (err && !ui->max_stack) err = 0; + /* + * Display what we got based on the order setup. + */ + for (i = 0; i < ui->idx && !err; i++) { + int j = i; + + if (callchain_param.order == ORDER_CALLER) + j = ui->idx - i - 1; + + err = ui->entries[j].ip ? ui->cb(&ui->entries[j], ui->arg) : 0; + } + out: if (err) pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1)); - dwfl_end(ui.dwfl); + dwfl_end(ui->dwfl); + free(ui); return 0; } diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h index 417a142..5832866 100644 --- a/tools/perf/util/unwind-libdw.h +++ b/tools/perf/util/unwind-libdw.h @@ -16,6 +16,8 @@ struct unwind_info { unwind_entry_cb_t cb; void *arg; int max_stack; + int idx; + struct unwind_entry entries[]; }; #endif /* __PERF_UNWIND_LIBDW_H */ -- cgit v1.1 From 30862f2c5725c46afcfab5af710fdf5163bf0f81 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Tue, 17 Nov 2015 22:53:21 +0900 Subject: perf tools: Add 'perf config' command The perf configuration file contains many variables to change various aspects of each of its tools, including output, disk usage, etc. But looking at the state of the configuration is difficult, and there is no documentation about the config variables except for the variables in perfconfig.example. So this patch adds a 'perf-config' command with a '--list' option. perf config [options] display current perf config variables.
# perf config -l | --list Signed-off-by: Taeung Song Acked-by: Namhyung Kim Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1447768424-17327-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 1 + tools/perf/builtin-config.c | 66 +++++++++++++++++++++++++++++++++++++++++++++ tools/perf/builtin.h | 1 + tools/perf/command-list.txt | 1 + tools/perf/perf.c | 1 + 5 files changed, 70 insertions(+) create mode 100644 tools/perf/builtin-config.c diff --git a/tools/perf/Build b/tools/perf/Build index 7223745..2c7aaf2 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -1,5 +1,6 @@ perf-y += builtin-bench.o perf-y += builtin-annotate.o +perf-y += builtin-config.o perf-y += builtin-diff.o perf-y += builtin-evlist.o perf-y += builtin-help.o diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c new file mode 100644 index 0000000..427ea7a --- /dev/null +++ b/tools/perf/builtin-config.c @@ -0,0 +1,66 @@ +/* + * builtin-config.c + * + * Copyright (C) 2015, Taeung Song + * + */ +#include "builtin.h" + +#include "perf.h" + +#include "util/cache.h" +#include "util/parse-options.h" +#include "util/util.h" +#include "util/debug.h" + +static const char * const config_usage[] = { + "perf config [options]", + NULL +}; + +enum actions { + ACTION_LIST = 1 +} actions; + +static struct option config_options[] = { + OPT_SET_UINT('l', "list", &actions, + "show current config variables", ACTION_LIST), + OPT_END() +}; + +static int show_config(const char *key, const char *value, + void *cb __maybe_unused) +{ + if (value) + printf("%s=%s\n", key, value); + else + printf("%s\n", key); + + return 0; +} + +int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) +{ + int ret = 0; + + argc = parse_options(argc, argv, config_options, config_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + + switch (actions) { + case ACTION_LIST: + if (argc) { + pr_err("Error: takes no arguments\n"); + parse_options_usage(config_usage, config_options, "l", 1); + } else { + ret = perf_config(show_config, NULL); + if (ret < 0) + pr_err("Nothing configured, " + "please check your ~/.perfconfig file\n"); + } + break; + default: + usage_with_options(config_usage, config_options); + } + + return ret; +} diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index 3688ad2..3f871b5 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -17,6 +17,7 @@ extern int cmd_annotate(int argc, const char **argv, const char *prefix); extern int cmd_bench(int argc, const char **argv, const char *prefix); extern int cmd_buildid_cache(int argc, const char **argv, const char *prefix); extern int cmd_buildid_list(int argc, const char **argv, const char *prefix); +extern int cmd_config(int argc, const char **argv, const char *prefix); extern int cmd_diff(int argc, const char **argv, const char *prefix); extern int cmd_evlist(int argc, const char **argv, const char *prefix); extern int cmd_help(int argc, const char **argv, const char *prefix); diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index 00fcaf8..acc3ea7 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -9,6 +9,7 @@ perf-buildid-cache mainporcelain common perf-buildid-list mainporcelain common perf-data mainporcelain common perf-diff mainporcelain common +perf-config mainporcelain common perf-evlist mainporcelain common perf-inject mainporcelain common perf-kmem mainporcelain common diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 3d4c7c0..4bee53c 100644 --- 
a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -39,6 +39,7 @@ struct cmd_struct { static struct cmd_struct commands[] = { { "buildid-cache", cmd_buildid_cache, 0 }, { "buildid-list", cmd_buildid_list, 0 }, + { "config", cmd_config, 0 }, { "diff", cmd_diff, 0 }, { "evlist", cmd_evlist, 0 }, { "help", cmd_help, 0 }, -- cgit v1.1 From 7d6852432acb3b09fc3ec45dd65421d34eebe3b5 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Sun, 22 Nov 2015 19:11:56 +0900 Subject: perf config: Add initial man page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add perf-config document to describe the perf configuration and a 'list' subcommand. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/63AD9B57-7B8C-46F8-8F18-0FFEB9A6A1BC@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-config.txt | 103 +++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 tools/perf/Documentation/perf-config.txt diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt new file mode 100644 index 0000000..b9ca1e3 --- /dev/null +++ b/tools/perf/Documentation/perf-config.txt @@ -0,0 +1,103 @@ +perf-config(1) +============== + +NAME +---- +perf-config - Get and set variables in a configuration file. + +SYNOPSIS +-------- +[verse] +'perf config' -l | --list + +DESCRIPTION +----------- +You can manage variables in a configuration file with this command. + +OPTIONS +------- + +-l:: +--list:: + Show current config variables, name and value, for all sections. + +CONFIGURATION FILE +------------------ + +The perf configuration file contains many variables to change various +aspects of each of its tools, including output, disk usage, etc. +The '$HOME/.perfconfig' file is used to store a per-user configuration. +The file '$(sysconfdir)/perfconfig' can be used to +store a system-wide default configuration. + +Syntax +~~~~~~ + +The file consists of sections. A section starts with its name +surrounded by square brackets and continues till the next section +begins. Each variable must be in a section, and have the form +'name = value', for example: + + [section] + name1 = value1 + name2 = value2 + +Section names are case sensitive and can contain any characters except +newline (double quote `"` and backslash have to be escaped as `\"` and `\\`, +respectively). Section headers can't span multiple lines.
+ +Example +~~~~~~~ + +Given a $HOME/.perfconfig like this: + +# +# This is the config file, and +# a '#' and ';' character indicates a comment +# + + [colors] + # Color variables + top = red, default + medium = green, default + normal = lightgray, default + selected = white, lightgray + code = blue, default + addr = magenta, default + root = white, blue + + [tui] + # Defaults if linked with libslang + report = on + annotate = on + top = on + + [buildid] + # Default, disable using /dev/null + dir = ~/.debug + + [annotate] + # Defaults + hide_src_code = false + use_offset = true + jump_arrows = true + show_nr_jumps = false + + [help] + # Format can be man, info, web or html + format = man + autocorrect = 0 + + [ui] + show-headers = true + + [call-graph] + # fp (framepointer), dwarf + record-mode = fp + print-type = graph + order = caller + sort-key = function + +SEE ALSO +-------- +linkperf:perf[1] -- cgit v1.1 From 646a6e846c4dc3812c614fd061603b6db5b8d380 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 21 Nov 2015 11:23:55 +0100 Subject: perf callchain: Add missing parent_val initialization Adding missing parent_val callchain_node initialization. It's causing segfault in perf top: $ sudo perf top -g perf: Segmentation fault -------- backtrace -------- free_callchain_node(+0x29) in perf [0x4a4b3e] free_callchain(+0x29) in perf [0x4a5a83] hist_entry__delete(+0x126) in perf [0x4c6649] hists__delete_entry(+0x6e) in perf [0x4c66dc] hists__decay_entries(+0x7d) in perf [0x4c6776] perf_top__sort_new_samples(+0x7c) in perf [0x436a78] hist_browser__run(+0xf2) in perf [0x507760] perf_evsel__hists_browse(+0x1da) in perf [0x507c8d] perf_evlist__tui_browse_hists(+0x3e) in perf [0x5088cf] display_thread_tui(+0x7f) in perf [0x437953] start_thread(+0xc5) in libpthread-2.21.so [0x7f7068fbb555] __clone(+0x6d) in libc-2.21.so [0x7f7066fc3b9d] [0x0] Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Masami Hiramatsu Cc: Wang Nan Fixes: 4b3a3212233a ("perf hists browser: Support flat callchains") Link: http://lkml.kernel.org/r/20151121102355.GA17313@krava.local Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 6e9b5f2..8ac8f043 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -143,6 +143,7 @@ extern __thread struct callchain_cursor callchain_cursor; static inline void callchain_init(struct callchain_root *root) { INIT_LIST_HEAD(&root->node.val); + INIT_LIST_HEAD(&root->node.parent_val); root->node.parent = NULL; root->node.hit = 0; -- cgit v1.1 From 32abc2ede536aae52978d6c0a8944eb1df14f460 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Nov 2015 17:25:16 -0500 Subject: tools lib traceevent: Fix output of %llu for 64 bit values read on 32 bit machines When a long value is read on 32 bit machines for 64 bit output, the parsing needs to change "%lu" into "%llu", as the value is read natively. Unfortunately, if "%llu" is already there, the code will add another "l" to it and fail to parse it properly. 
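[Editor's note: the fix above keys the 'l' insertion on the parsed long-modifier count (ls), so a format that already reads "%llu" is left untouched and only "%lu" gains the second 'l'. A tiny standalone C illustration of that guard follows; the helper name and the caller-provided buffer sizes are assumptions for this sketch, not the traceevent API.]

#include <stdio.h>
#include <string.h>

/*
 * Widen "%lu" to "%llu" in place. 'format' must have room for one
 * extra character; 'ls' is the number of 'l' modifiers already seen.
 */
static void promote_to_ll(char *format, int ls)
{
	char *p = strchr(format, 'l');

	/* Insert a second 'l' only when exactly one is present. */
	if (ls == 1 && p)
		memmove(p + 1, p, strlen(p) + 1);
}

int main(void)
{
	char f1[8] = "%lu";	/* needs widening */
	char f2[8] = "%llu";	/* must stay as-is */

	promote_to_ll(f1, 1);
	promote_to_ll(f2, 2);
	printf("%s %s\n", f1, f2);	/* prints: %llu %llu */
	return 0;
}

[End editor's note.]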
Signed-off-by: Steven Rostedt Acked-by: Namhyung Kim Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151116172516.4b79b109@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 2a912df..68276f3 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -4968,13 +4968,12 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event sizeof(long) != 8) { char *p; - ls = 2; /* make %l into %ll */ - p = strchr(format, 'l'); - if (p) + if (ls == 1 && (p = strchr(format, 'l'))) memmove(p+1, p, strlen(p)+1); else if (strcmp(format, "%p") == 0) strcpy(format, "0x%llx"); + ls = 2; } switch (ls) { case -2: -- cgit v1.1 From 8b38937b7ab55e93065a14c88753b1fe83e93c60 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 24 Nov 2015 08:41:17 +0100 Subject: x86/mce: Do not enter deferred errors into the generic pool twice We used to have a special ring buffer for deferred errors that was used to mark problem pages. We replaced that with a generic pool. Then later converted mce_log() to also use the same pool. As a result, we end up adding all deferred errors to the pool twice. Rearrange this code. Make sure to set the m.severity and m.usable_addr fields for deferred errors. Then, if the flags and mca_cfg.dont_log_ce mean we call mce_log(), we are done, because that will add this entry to the generic pool. If we skipped mce_log(), then we still want to take action for the deferred error, so add to the pool. Change the name of the boolean "error_logged" to "error_seen"; we should set it whether or not we logged an error, because the return value from machine_check_poll() is used to decide whether storms have subsided or not. Reported-by: Gong Chen Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/1448350880-5573-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c5b0d56..6531cb4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -567,7 +567,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); */ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { - bool error_logged = false; + bool error_seen = false; struct mce m; int severity; int i; @@ -601,6 +601,8 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; + error_seen = true; + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) @@ -608,17 +610,10 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - /* - * In the cases where we don't have a valid address after all, - * do not add it into the ring buffer.
- */ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { if (m.status & MCI_STATUS_ADDRV) { m.severity = severity; m.usable_addr = mce_usable_address(&m); - - if (!mce_gen_pool_add(&m)) - mce_schedule_work(); } } @@ -626,9 +621,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) { - error_logged = true; + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); + else if (m.usable_addr) { + /* + * Although we skipped logging this, we still want + * to take action. Add to the pool so the registered + * notifiers will see it. + */ + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); } /* @@ -644,7 +646,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) sync_core(); - return error_logged; + return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); -- cgit v1.1 From c0ec382e1928402031e754ad0391ecbdabb18c43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 24 Nov 2015 08:41:18 +0100 Subject: x86/RAS: Remove mce.usable_addr It is useless and we can use the function instead. Besides, mcelog(8) hasn't managed to make use of it yet. So kill it. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448350880-5573-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 03429da..2184943 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -16,7 +16,7 @@ struct mce { __u8 cpuvendor; /* cpu vendor as encoded in system.h */ __u8 inject_flags; /* software inject flags */ __u8 severity; - __u8 usable_addr; + __u8 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6531cb4..fb8b1db 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -484,7 +484,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, if (!mce) return NOTIFY_DONE; - if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { + if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; memory_failure(pfn, MCE_VECTOR, 0); } @@ -610,12 +610,9 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { - if (m.status & MCI_STATUS_ADDRV) { + if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (m.status & MCI_STATUS_ADDRV) m.severity = severity; - m.usable_addr = mce_usable_address(&m); - } - } /* * Don't get the IP here because it's unlikely to @@ -623,7 +620,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); - else if (m.usable_addr) { + else if (mce_usable_address(&m)) { /* * Although we skipped logging this, we still want * to take action. 
Add to the pool so the registered @@ -1091,7 +1088,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* assuming valid severity level != 0 */ m.severity = severity; - m.usable_addr = mce_usable_address(&m); mce_log(&m); -- cgit v1.1 From db548a28fcee0f38cf4c7c726becf24c8afacf02 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 24 Nov 2015 08:41:19 +0100 Subject: x86/mce: Add the missing memory error check on AMD We simply need to look at the extended error code when detecting whether the error is of type memory. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448350880-5573-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fb8b1db..e00e85a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -522,10 +522,10 @@ static bool memory_error(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor == X86_VENDOR_AMD) { - /* - * coming soon - */ - return false; + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); } else if (c->x86_vendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes -- cgit v1.1 From feab21f8356bde572663e29c9d9e48c964292e05 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 24 Nov 2015 08:41:20 +0100 Subject: x86/mce: Make usable address checks Intel-only The MCi_MISC bitfield definitions mce_usable_address() checks are Intel-only. Make them so. While at it, move mce_usable_address() up, before all its callers and get rid of the forward declaration. Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448350880-5573-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e00e85a..3865e95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -114,7 +114,6 @@ static struct work_struct mce_work; static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); -static int mce_usable_address(struct mce *m); /* * CPU/chipset specific EDAC code can register a notifier call here to print @@ -475,6 +474,28 @@ static void mce_report_event(struct pt_regs *regs) irq_work_queue(&mce_irq_work); } +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granuality for now. 
+ */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + + /* Checks after this one are Intel-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 1; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return 0; + return 1; +} + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -930,23 +951,6 @@ reset: return ret; } -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. - */ -static int mce_usable_address(struct mce *m) -{ - if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) - return 0; - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; - return 1; -} - static void mce_clear_state(unsigned long *toclear) { int i; -- cgit v1.1 From 99f925ce927e4ac313d9af8bd1bf55796e2cdcb1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2015 11:12:21 +0100 Subject: x86/cpu: Unify CPU family, model, stepping calculation Add generic functions which calc family, model and stepping from the CPUID_1.EAX leaf and stick them into the library we have. Rename those which do call CPUID with the prefix "x86_cpuid" as suggested by Paolo Bonzini. No functionality change. Signed-off-by: Borislav Petkov Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448273546-2567-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpu.h | 3 +++ arch/x86/include/asm/microcode.h | 39 +++++++---------------------------- arch/x86/kernel/cpu/common.c | 11 +++------- arch/x86/kernel/cpu/microcode/core.c | 12 +++++------ arch/x86/kernel/cpu/microcode/intel.c | 16 ++++++-------- arch/x86/lib/Makefile | 2 +- arch/x86/lib/cpu.c | 35 +++++++++++++++++++++++++++++++ 7 files changed, 61 insertions(+), 57 deletions(-) create mode 100644 arch/x86/lib/cpu.c diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index bf2caa1..678637a 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -36,4 +36,7 @@ extern int _debug_hotplug_cpu(int cpu, int action); int mwait_usable(const struct cpuinfo_x86 *); +unsigned int x86_family(unsigned int sig); +unsigned int x86_model(unsigned int sig); +unsigned int x86_stepping(unsigned int sig); #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 34e62b1..1e1b07a 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include #include #define native_rdmsr(msr, val1, val2) \ @@ -95,14 +96,14 @@ static inline void __exit exit_amd_microcode(void) {} /* * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_vendor() gets vendor id for BSP. + * x86_cpuid_vendor() gets vendor id for BSP. * * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_vendor() to get vendor id for AP. 
+ * coding, we still use x86_cpuid_vendor() to get vendor id for AP. * - * x86_vendor() gets vendor information directly from CPUID. + * x86_cpuid_vendor() gets vendor information directly from CPUID. */ -static inline int x86_vendor(void) +static inline int x86_cpuid_vendor(void) { u32 eax = 0x00000000; u32 ebx, ecx = 0, edx; @@ -118,40 +119,14 @@ static inline int x86_vendor(void) return X86_VENDOR_UNKNOWN; } -static inline unsigned int __x86_family(unsigned int sig) -{ - unsigned int x86; - - x86 = (sig >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (sig >> 20) & 0xff; - - return x86; -} - -static inline unsigned int x86_family(void) +static inline unsigned int x86_cpuid_family(void) { u32 eax = 0x00000001; u32 ebx, ecx = 0, edx; native_cpuid(&eax, &ebx, &ecx, &edx); - return __x86_family(eax); -} - -static inline unsigned int x86_model(unsigned int sig) -{ - unsigned int x86, model; - - x86 = __x86_family(sig); - - model = (sig >> 4) & 0xf; - - if (x86 == 0x6 || x86 == 0xf) - model += ((sig >> 16) & 0xf) << 4; - - return model; + return x86_family(eax); } #ifdef CONFIG_MICROCODE diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c2b7522..0bed416 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -581,14 +581,9 @@ void cpu_detect(struct cpuinfo_x86 *c) u32 junk, tfms, cap0, misc; cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xf) << 4; + c->x86 = x86_family(tfms); + c->x86_model = x86_model(tfms); + c->x86_mask = x86_stepping(tfms); if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7fc27f1..3aaffb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -129,8 +129,8 @@ void __init load_ucode_bsp(void) if (!have_cpuid_p()) return; - vendor = x86_vendor(); - family = x86_family(); + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); switch (vendor) { case X86_VENDOR_INTEL: @@ -165,8 +165,8 @@ void load_ucode_ap(void) if (!have_cpuid_p()) return; - vendor = x86_vendor(); - family = x86_family(); + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); switch (vendor) { case X86_VENDOR_INTEL: @@ -206,8 +206,8 @@ void reload_early_microcode(void) { int vendor, family; - vendor = x86_vendor(); - family = x86_family(); + vendor = x86_cpuid_vendor(); + family = x86_cpuid_family(); switch (vendor) { case X86_VENDOR_INTEL: diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ce47402..ee81c54 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -145,10 +145,10 @@ matching_model_microcode(struct microcode_header_intel *mc_header, int ext_sigcount, i; struct extended_signature *ext_sig; - fam = __x86_family(sig); + fam = x86_family(sig); model = x86_model(sig); - fam_ucode = __x86_family(mc_header->sig); + fam_ucode = x86_family(mc_header->sig); model_ucode = x86_model(mc_header->sig); if (fam == fam_ucode && model == model_ucode) @@ -163,7 +163,7 @@ matching_model_microcode(struct microcode_header_intel *mc_header, ext_sigcount = ext_header->count; for (i = 0; i < ext_sigcount; i++) { - fam_ucode = __x86_family(ext_sig->sig); + fam_ucode = x86_family(ext_sig->sig); model_ucode = 
x86_model(ext_sig->sig); if (fam == fam_ucode && model == model_ucode) @@ -365,7 +365,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - family = __x86_family(csig.sig); + family = x86_family(csig.sig); model = x86_model(csig.sig); if ((model >= 5) || (family > 6)) { @@ -521,16 +521,12 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) { #ifdef CONFIG_X86_64 unsigned int eax = 0x00000001, ebx, ecx = 0, edx; - unsigned int family, model, stepping; char name[30]; native_cpuid(&eax, &ebx, &ecx, &edx); - family = __x86_family(eax); - model = x86_model(eax); - stepping = eax & 0xf; - - sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); + sprintf(name, "intel-ucode/%02x-%02x-%02x", + x86_family(eax), x86_model(eax), x86_stepping(eax)); return get_builtin_firmware(cp, name); #else diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f258788..a501fa2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -16,7 +16,7 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o -lib-y := delay.o misc.o cmdline.o +lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c new file mode 100644 index 0000000..aa417a9 --- /dev/null +++ b/arch/x86/lib/cpu.c @@ -0,0 +1,35 @@ +#include + +unsigned int x86_family(unsigned int sig) +{ + unsigned int x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} +EXPORT_SYMBOL_GPL(x86_family); + +unsigned int x86_model(unsigned int sig) +{ + unsigned int fam, model; + + fam = x86_family(sig); + + model = (sig >> 4) & 0xf; + + if (fam >= 0x6) + model += ((sig >> 16) & 0xf) << 4; + + return model; +} +EXPORT_SYMBOL_GPL(x86_model); + +unsigned int x86_stepping(unsigned int sig) +{ + return sig & 0xf; +} +EXPORT_SYMBOL_GPL(x86_stepping); -- cgit v1.1 From 91713faf386be6d7e6556b656436813f8c4ee552 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2015 11:12:22 +0100 Subject: kvm: Add accessors for guest CPU's family, model, stepping Those give the family, model and stepping of the guest vcpu. Signed-off-by: Borislav Petkov Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448273546-2567-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kvm/cpuid.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 06332cb..5d47e0d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_CPUID_H #include "x86.h" +#include int kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, @@ -170,4 +171,37 @@ static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) } #undef BIT_NRIPS +static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (!best) + return -1; + + return x86_family(best->eax); +} + +static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (!best) + return -1; + + return x86_model(best->eax); +} + +static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + if (!best) + return -1; + + return x86_stepping(best->eax); +} + #endif -- cgit v1.1 From ae8b787543d872cf89a7f9ef8aa302f3ef9bcbd7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2015 11:12:23 +0100 Subject: x86/cpu/amd, kvm: Satisfy guest kernel reads of IC_CFG MSR The kernel accesses IC_CFG MSR (0xc0011021) on AMD because it checks whether the way access filter is enabled on some F15h models, and, if so, disables it. kvm doesn't handle that MSR access and complains about it, which can get really noisy in dmesg when one starts kvm guests all the time for testing. And it is useless anyway: the guest kernel shouldn't be making such changes, so tell it that the filter is disabled. Signed-off-by: Borislav Petkov Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448273546-2567-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kvm/svm.c | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 690b402..b05402e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -321,6 +321,7 @@ #define MSR_F15H_PERF_CTR 0xc0010201 #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_IC_CFG 0xc0011021 /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a8816b3..e229640 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -678,9 +678,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs.
*/ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { + if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(0xc0011021, value); + wrmsrl_safe(MSR_F15H_IC_CFG, value); } } } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 83a1c64..58b64c1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3053,6 +3053,23 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_UCODE_REV: msr_info->data = 0x01000065; break; + case MSR_F15H_IC_CFG: { + + int family, model; + + family = guest_cpuid_family(vcpu); + model = guest_cpuid_model(vcpu); + + if (family < 0 || model < 0) + return kvm_get_msr_common(vcpu, msr_info); + + msr_info->data = 0; + + if (family == 0x15 && + (model >= 0x2 && model < 0x20)) + msr_info->data = 0x1E; + } + break; default: return kvm_get_msr_common(vcpu, msr_info); } -- cgit v1.1 From 31ac34ca5636e596485c6e03df1879643bde585e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2015 11:12:25 +0100 Subject: x86/cpu: Fix MSR value truncation issue So sparse rightfully complains that the u64 MSR value we're writing into the STAR MSR, i.e. 0xc0000081, is being truncated: ./arch/x86/include/asm/msr.h:193:36: warning: cast truncates bits from constant value (23001000000000 becomes 0) because the actual value doesn't fit into the unsigned 32-bit quantity which are the @low and @high wrmsrl() parameters. This is not a problem, practically, because gcc is actually being smart enough here and does the right thing: .loc 3 87 0 xorl %esi, %esi # we needz a 32-bit zero movl $2293776, %edx # 0x00230010 == (__USER32_CS << 16) | __KERNEL_CS go into the high bits movl $-1073741695, %ecx # MSR_STAR, i.e., 0xc0000081 movl %esi, %eax # low order 32 bits in the MSR which are 0 #APP # 87 "./arch/x86/include/asm/msr.h" 1 wrmsr More specifically, MSR_STAR[31:0] is being set to 0. That field is reserved on Intel and on AMD it is 32-bit SYSCALL Target EIP. I'd strongly guess because Intel doesn't have SYSCALL in compat/legacy mode and we're using SYSENTER and INT80 there. And for compat syscalls in long mode we use CSTAR. So let's fix the sparse warning by writing SYSRET and SYSCALL CS and SS into the high 32-bit half of STAR and 0 in the low half explicitly. [ Actually, if we had to be precise, we would have to read what's in STAR[31:0] and write it back unchanged on Intel and write 0 on AMD. I guess the current writing to 0 is still ok since Intel can apparently stomach it. ] The resulting code is identical to what we have above: .loc 3 87 0 xorl %esi, %esi # tmp104 movl $2293776, %eax #, tmp103 movl $-1073741695, %ecx #, tmp102 movl %esi, %edx # tmp104, tmp104 ... wrmsr Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448273546-2567-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0bed416..105da8d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1180,7 +1180,7 @@ void syscall_init(void) * They both write to the same internal register. STAR allows to * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. 
*/ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION -- cgit v1.1 From 679bcea857d72868e3431dde3a0e158bf0ed9119 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Nov 2015 11:12:26 +0100 Subject: x86/MSR: Chop off lower 32-bit value sparse complains that the cast truncates the high bits. But here we really do know what we're doing and we need the lower 32 bits only as the @low argument. So make that explicit. Suggested-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448273546-2567-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 77d8b28..8613382 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -190,7 +190,7 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) static inline void wrmsrl(unsigned msr, u64 val) { - native_write_msr(msr, (u32)val, (u32)(val >> 32)); + native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ -- cgit v1.1 From 42baa2581c92f8d07e7260506c8d41caf14b0fc3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 23 Nov 2015 11:59:24 +0100 Subject: x86/apic: Fix the saving and restoring of lapic vectors during suspend/resume Saving and restoring lapic vectors in lapic_suspend() and lapic_resume() is not consistent: the thmr vector saving is guarded by a different config option than the restore part. The cmci vector isn't handled at all. Those inconsistencies are not very critical: the missing cmci vector will be set via mce resume handling, and the config option wrongly guarding the thmr vector restore can't be configured differently from the one which should be used. Nevertheless, correct the thmr vector restore and add cmci vector handling. Signed-off-by: Juergen Gross Acked-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448276364-31334-1-git-send-email-jgross@suse.com [ Minor code edits.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2f69e3b1..8d7df74 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2270,6 +2270,7 @@ static struct { unsigned int apic_tmict; unsigned int apic_tdcr; unsigned int apic_thmr; + unsigned int apic_cmci; } apic_pm_state; static int lapic_suspend(void) @@ -2299,6 +2300,10 @@ static int lapic_suspend(void) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI); +#endif local_irq_save(flags); disable_local_APIC(); @@ -2355,10 +2360,14 @@ static void lapic_resume(void) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_INTEL) +#ifdef CONFIG_X86_THERMAL_VECTOR if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) + apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci); +#endif if (maxlvt >= 4) apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); -- cgit v1.1 From b74a0cf1b3db30173eefa00c411775d2b1697700 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Nov 2015 12:25:25 +0100 Subject: x86/fpu: Add an XSTATE_OP() macro Add an XSTATE_OP() macro which contains the XSAVE* fault handling and replace all non-alternatives users of xstate_fault() with it. This also fixes the buglet in copy_xregs_to_user() and copy_user_to_xregs(), where the inline asm didn't have @xstate as a memory reference, potentially causing unwanted reordering of accesses to the extended state. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447932326-4371-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 68 +++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3c3550c..709a3df 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -237,6 +237,20 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) _ASM_EXTABLE(1b, 3b) \ : [_err] "=r" (__err) +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + ".pushsection .fixup,\"ax\"\n\t" \ + "3: movl $-2,%[err]\n\t" \ + "jmp 2b\n\t" \ + ".popsection\n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + + /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet.
@@ -246,22 +260,14 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; WARN_ON(system_state != SYSTEM_BOOTING); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + XSTATE_OP(XSAVE, xstate, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); @@ -276,22 +282,14 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; WARN_ON(system_state != SYSTEM_BOOTING); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault(err) - : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* We should never fault when copying from a kernel buffer: */ WARN_ON_FPU(err); @@ -388,12 +386,10 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) if (unlikely(err)) return -EFAULT; - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault(err) - : "D" (buf), "a" (-1), "d" (-1), "0" (err) - : "memory"); + stac(); + XSTATE_OP(XSAVE, buf, -1, -1, err); + clac(); + return err; } @@ -405,14 +401,12 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault(err) - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err) - : "memory"); /* memory required? */ return err; } -- cgit v1.1 From b7106fa0f29f9fd83d2d1905ab690d334ef855c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Nov 2015 12:25:26 +0100 Subject: x86/fpu: Get rid of xstate_fault() Add macros for the alternative XSAVE*/XRSTOR* operations which contain the fault handling and use them. Kill xstate_fault(). Also, copy_xregs_to_kernel() didn't have the extended state as memory reference in the asm. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447932326-4371-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/internal.h | 105 ++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 709a3df..eadcdd5 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -224,19 +224,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" -/* xstate instruction fault handler: */ -#define xstate_fault(__err) \ - \ - ".section .fixup,\"ax\"\n" \ - \ - "3: movl $-2,%[_err]\n" \ - " jmp 2b\n" \ - \ - ".previous\n" \ - \ - _ASM_EXTABLE(1b, 3b) \ - : [_err] "=r" (__err) - #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ @@ -250,6 +237,54 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + ".pushsection .fixup,\"ax\"\n" \ + "4: movl $-2, %[err]\n" \ + "jmp 3b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(661b, 4b) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") /* * This function is called only during boot time when x86 caps are not set @@ -303,33 +338,11 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; WARN_ON(!alternatives_patched); - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. 
- */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [xstate] "D" (xstate), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault(err) - : "0" (err) - : "memory"); + XSTATE_XSAVE(xstate, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); @@ -342,23 +355,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - int err = 0; + int err; - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault(err) - : "0" (err) - : "memory"); + XSTATE_XRESTORE(xstate, lmask, hmask, err); /* We should never fault when copying from a kernel buffer: */ WARN_ON_FPU(err); -- cgit v1.1 From ed11a7f1b3bd482bd7d6ef7bc2859c41fb43b9ee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:01 -0800 Subject: context_tracking: Switch to new static_branch API This is much less error-prone than the old code. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/812df7e64f120c5c7c08481f36a8caa9f53b2199.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar --- include/linux/context_tracking_state.h | 4 ++-- kernel/context_tracking.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index ee956c5..1d34fe6 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -22,12 +22,12 @@ struct context_tracking { }; #ifdef CONFIG_CONTEXT_TRACKING -extern struct static_key context_tracking_enabled; +extern struct static_key_false context_tracking_enabled; DECLARE_PER_CPU(struct context_tracking, context_tracking); static inline bool context_tracking_is_enabled(void) { - return static_key_false(&context_tracking_enabled); + return static_branch_unlikely(&context_tracking_enabled); } static inline bool context_tracking_cpu_is_enabled(void) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index d8560ee..9ad37b9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -24,7 +24,7 @@ #define CREATE_TRACE_POINTS #include -struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); @@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_enabled); } if (initialized) -- cgit v1.1 From c28454332fe0b65e22c3a2717e5bf05b5b47ca20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:02 -0800 Subject: x86/asm: Error out if asm/jump_label.h is included inappropriately Rather than potentially generating incorrect code on a non-HAVE_JUMP_LABEL kernel if someone includes asm/jump_label.h, error out. 
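[ Editor's note: the guard itself is a two-line preprocessor check; a condensed sketch of what the patch below adds. Any translation unit that pulls this header into a non-HAVE_JUMP_LABEL build now fails loudly at compile time instead of silently miscompiling static branches:

	#ifndef HAVE_JUMP_LABEL
	#error asm/jump_label.h included on a non-jump-label kernel
	#endif
]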
Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/99407f0ac7fa3ab03a3d31ce076d47b5c2f44795.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/jump_label.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 5daeca3..96872dc 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -1,6 +1,19 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H +#ifndef HAVE_JUMP_LABEL +/* + * For better or for worse, if jump labels (the gcc extension) are missing, + * then the entire static branch patching infrastructure is compiled out. + * If that happens, the code in here will malfunction. Raise a compiler + * error instead. + * + * In theory, jump labels and the static branch patching infrastructure + * could be decoupled to fix this. + */ +#error asm/jump_label.h included on a non-jump-label kernel +#endif + #ifndef __ASSEMBLY__ #include -- cgit v1.1 From 2671c3e4fe2a34bd9bf2eecdf5d1149d4b55dbdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:03 -0800 Subject: x86/asm: Add asm macros for static keys/jump labels Unfortunately, we can only do this if HAVE_JUMP_LABEL. In principle, we could do some serious surgery on the core jump label infrastructure to keep the patch infrastructure available on x86 on all builds, but that's probably not worth it. Implementing the macros using a conditional branch as a fallback seems like a bad idea: we'd have to clobber flags. This limitation can't cause silent failures -- trying to include asm/jump_label.h at all on a non-HAVE_JUMP_LABEL kernel will error out. The macro's users are responsible for handling this issue themselves. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/63aa45c4b692e8469e1876d6ccbb5da707972990.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/jump_label.h | 52 +++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 96872dc..adc54c1 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -14,13 +14,6 @@ #error asm/jump_label.h included on a non-jump-label kernel #endif -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 @@ -29,6 +22,14 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif +#include +#include + +#ifndef __ASSEMBLY__ + +#include +#include + static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" @@ -72,5 +73,40 @@ struct jump_entry { jump_label_t key; }; -#endif /* __ASSEMBLY__ */ +#else /* __ASSEMBLY__ */ + +.macro STATIC_JUMP_IF_TRUE target, key, def +.Lstatic_jump_\@: + .if \def + /* Equivalent to "jmp.d32 \target" */ + .byte 0xe9 + .long \target - .Lstatic_jump_after_\@ +.Lstatic_jump_after_\@: + .else + .byte STATIC_KEY_INIT_NOP + .endif + .pushsection __jump_table, "aw" + _ASM_ALIGN + _ASM_PTR .Lstatic_jump_\@, \target, \key + .popsection +.endm + +.macro STATIC_JUMP_IF_FALSE target, key, def +.Lstatic_jump_\@: + .if \def + .byte STATIC_KEY_INIT_NOP + .else + /* Equivalent to "jmp.d32 \target" */ + .byte 0xe9 + .long \target - .Lstatic_jump_after_\@ +.Lstatic_jump_after_\@: + .endif + .pushsection __jump_table, "aw" + _ASM_ALIGN + _ASM_PTR .Lstatic_jump_\@, \target, \key + 1 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + #endif -- cgit v1.1 From 478dc89cf316697e8029411a64ea2b30c528434d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:04 -0800 Subject: x86/entry/64: Bypass enter_from_user_mode on non-context-tracking boots On CONFIG_CONTEXT_TRACKING kernels that have context tracking disabled at runtime (which includes most distro kernels), we still have the overhead of a call to enter_from_user_mode in interrupt and exception entries. If jump labels are available, this uses the jump label infrastructure to skip the call. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/73ee804fff48cd8c66b65b724f9f728a11a8c686.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 15 +++++++++++++++ arch/x86/entry/entry_64.S | 8 ++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3c71dd9..e32206e 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,3 +1,5 @@ +#include + /* x86 function call convention, 64-bit: @@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with #endif /* CONFIG_X86_64 */ +/* + * This does 'call enter_from_user_mode' unless we can avoid it based on + * kernel config or using the static jump infrastructure. 
+ */ +.macro CALL_enter_from_user_mode +#ifdef CONFIG_CONTEXT_TRACKING +#ifdef HAVE_JUMP_LABEL + STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 +#endif + call enter_from_user_mode +.Lafter_call_\@: +#endif +.endm diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a55697d..9d34d3c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -520,9 +520,7 @@ END(irq_entries_start) */ TRACE_IRQS_OFF -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif + CALL_enter_from_user_mode 1: /* @@ -1066,9 +1064,7 @@ ENTRY(error_entry) * (which can take locks). */ TRACE_IRQS_OFF -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif + CALL_enter_from_user_mode ret .Lerror_entry_done: -- cgit v1.1 From 4d3b16269059eee12dc572848191c8e4e7bd24b3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 25 Nov 2015 19:34:32 +0900 Subject: perf probe: Fix to free temporal Dwarf_Frame correctly The commit 05c8d802fa52 ("perf probe: Fix to free temporal Dwarf_Frame") tried to fix the memory leak of Dwarf_Frame, but it released the frame at the wrong point. Since the dwarf_frame_cfa(frame, &pf->fb_ops, &nops) can return an address inside the frame data structure to pf->fb_ops, we cannot release the frame before using pf->fb_ops. This reverts the commit and releases the frame afterwards (right before returning from call_probe_finder) correctly. Reported-and-Tested-by: Arnaldo Carvalho de Melo Reported-by: Michael Petlan Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Fixes: 05c8d802fa52 ("perf probe: Fix to free temporal Dwarf_Frame") LPU-Reference: 20151125103432.1473.31009.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-finder.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 1cab05a..2be10fb 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -654,6 +654,7 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwfl_Module *mod, static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf) { Dwarf_Attribute fb_attr; + Dwarf_Frame *frame = NULL; size_t nops; int ret; @@ -683,26 +684,24 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf) ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1); if (ret <= 0 || nops == 0) { pf->fb_ops = NULL; - ret = 0; #if _ELFUTILS_PREREQ(0, 142) } else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa && pf->cfi != NULL) { - Dwarf_Frame *frame = NULL; if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 || dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) { pr_warning("Failed to get call frame on 0x%jx\n", (uintmax_t)pf->addr); - ret = -ENOENT; + free(frame); + return -ENOENT; } - free(frame); #endif } /* Call finder's callback handler */ - if (ret >= 0) - ret = pf->callback(sc_die, pf); + ret = pf->callback(sc_die, pf); - /* *pf->fb_ops will be cached in libdw. Don't free it. */ + /* Since *pf->fb_ops can be a part of frame, we should free it here. */ + free(frame); pf->fb_ops = NULL; return ret; -- cgit v1.1 From 5725dd8fa888b4dcdff58241f9d3d3ac42a048e2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 24 Nov 2015 13:36:06 +0000 Subject: tools build: Clean CFLAGS and LDFLAGS for fixdep Sometimes passing variables to tools/build is dangerous.
For example, on my platform there is a gcc problem (gcc 4.8.1): It passes the stackprotector-all feature check: $ gcc -fstack-protector-all -c ./test.c $ echo $? 0 But it requires LDFLAGS support when compiling and linking separately: $ gcc -fstack-protector-all -c ./test.c $ gcc ./test.o ./test.o: In function `main': test.c:(.text+0xb): undefined reference to `__stack_chk_guard' test.c:(.text+0x21): undefined reference to `__stack_chk_guard' collect2: error: ld returned 1 exit status $ gcc -fstack-protector-all ./test.o $ echo $? 0 $ gcc ./test.o -lssp $ echo $? 0 $ In this environment building perf throws an error: $ make BUILD: Doing 'make -j24' parallel build config/Makefile:344: No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR config/Makefile:403: No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev config/Makefile:418: slang not found, disables TUI support. Please install slang-devel or libslang-dev config/Makefile:432: GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev config/Makefile:564: No bfd.h/libbfd found, please install binutils-dev[el]/zlib-static/libiberty-dev to gain symbol demangling config/Makefile:606: No numa.h found, disables 'perf bench numa mem' benchmark, please install numactl-devel/libnuma-devel/libnuma-dev CC fixdep.o LD fixdep-in.o LINK fixdep fixdep-in.o: In function `parse_dep_file': /kernel/tools/build/fixdep.c:47: undefined reference to `__stack_chk_guard' /kernel/tools/build/fixdep.c:117: undefined reference to `__stack_chk_guard' fixdep-in.o: In function `main': /kernel-hydrogen/tools/build/fixdep.c:156: undefined reference to `__stack_chk_guard' /kernel/tools/build/fixdep.c:168: undefined reference to `__stack_chk_guard' collect2: error: ld returned 1 exit status make[2]: *** [fixdep] Error 1 make[1]: *** [fixdep] Error 2 make: *** [all] Error 2 This is because the CFLAGS used in building perf pollute the CFLAGS used for fixdep, passing -fstack-protector-all to build fixdep, which is obviously not required. Since fixdep is a small host-side tool, we should keep its CFLAGS/LDFLAGS simple and clean. This patch clears the CFLAGS and LDFLAGS passed when building fixdep, so such a gcc problem won't block the perf build process. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448372181-151723-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.include | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build/Makefile.include b/tools/build/Makefile.include index 4e09ad6..6254760 100644 --- a/tools/build/Makefile.include +++ b/tools/build/Makefile.include @@ -4,7 +4,7 @@ ifdef CROSS_COMPILE fixdep: else fixdep: - $(Q)$(MAKE) -C $(srctree)/tools/build fixdep + $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= fixdep endif .PHONY: fixdep -- cgit v1.1 From d8ad6a15cc3a364de6c8010378adc3fb06ce3ff1 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 24 Nov 2015 13:36:07 +0000 Subject: tools lib bpf: Don't do a feature check when cleaning Before this patch libbpf always does a feature check, even when cleaning. For example: $ cd kernel/tools/lib/bpf $ make Auto-detecting system features: ... libelf: [ on ] ... bpf: [ on ] CC libbpf.o CC bpf.o LD libbpf-in.o LINK libbpf.a LINK libbpf.so $ make clean CLEAN libbpf CLEAN core-gen $ make clean Auto-detecting system features: ... libelf: [ on ] ...
bpf: [ on ] CLEAN libbpf CLEAN core-gen $ Although the first 'make clean' doesn't show the feature check result, it still does the check. There is no output because the check result matches the cached FEATURE-DUMP.libbpf. This patch uses the same method as perf to turn off feature checking for 'make clean'. Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448372181-151723-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/Makefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index a3caaf3..636e3dd 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -71,7 +71,17 @@ FEATURE_DISPLAY = libelf bpf INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES) +check_feat := 1 +NON_CHECK_FEAT_TARGETS := clean TAGS tags cscope help +ifdef MAKECMDGOALS +ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),) + check_feat := 0 +endif +endif + +ifeq ($(check_feat),1) include $(srctree)/tools/build/Makefile.feature +endif export prefix libdir src obj -- cgit v1.1 From 0007bccc3cfd1e69deb0fd73ccc426b4cedb061d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 16 Aug 2015 11:20:00 -0400 Subject: x86: Replace RDRAND forced-reseed with simple sanity check x86_init_rdrand() was added with 2 goals: 1. Sanity check that the built-in-self-test circuit on the Digital Random Number Generator (DRNG) is not complaining. As RDRAND HW self-checks on every invocation, this goal is achieved by simply invoking RDRAND and checking its return code. 2. Force a full re-seed of the random number generator. This was done out of paranoia to benefit the most un-sophisticated DRNG implementation conceivable in the architecture, an implementation that does not exist, and likely never will. This worst-case full-re-seed is achieved by invoking a 64-bit RDRAND 8192 times. Unfortunately, this worst-case re-seed costs O(1,000us). Magnifying this cost, it is done from identify_cpu(), which is the synchronous critical path to bring a processor on-line -- repeated for every logical processor in the system at boot and resume from S3. As it is very expensive, and of highly dubious value, we delete the worst-case re-seed from the kernel. We keep the 1st goal -- sanity check the hardware, and mark it absent if it complains. This change reduces the cost of x86_init_rdrand() by a factor of 1,000x, to O(1us) from O(1,000us). Signed-off-by: Len Brown Link: http://lkml.kernel.org/r/058618cc56ec6611171427ad7205e37e377aa8d4.1439738240.git.len.brown@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/rdrand.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 136ac74..819d949 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -33,28 +33,27 @@ static int __init x86_rdrand_setup(char *s) __setup("nordrand", x86_rdrand_setup); /* - * Force a reseed cycle; we are architecturally guaranteed a reseed - * after no more than 512 128-bit chunks of random data. This also - * acts as a test of the CPU capability. + * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. + * Run the instruction a few times as a sanity check.
+ * If it fails, it is simple to disable RDRAND here. */ -#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) +#define SANITY_CHECK_LOOPS 8 void x86_init_rdrand(struct cpuinfo_x86 *c) { #ifdef CONFIG_ARCH_RANDOM unsigned long tmp; - int i, count, ok; + int i; if (!cpu_has(c, X86_FEATURE_RDRAND)) - return; /* Nothing to do */ + return; - for (count = i = 0; i < RESEED_LOOP; i++) { - ok = rdrand_long(&tmp); - if (ok) - count++; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (!rdrand_long(&tmp)) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + printk_once(KERN_WARNING "rdrand: disabled\n"); + return; + } } - - if (count != RESEED_LOOP) - clear_cpu_cap(c, X86_FEATURE_RDRAND); #endif } -- cgit v1.1 From d6ccc3ec95251d8d3276f2900b59cbc468dd74f4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 17 Nov 2015 15:51:19 +0100 Subject: x86/paravirt: Remove paravirt ops pmd_update[_defer] and pte_update_defer pte_update_defer can be removed as it is always set to the same function as pte_update. So any usage of pte_update_defer() can be replaced by pte_update(). pmd_update and pmd_update_defer are always set to paravirt_nop, so they can just be nuked. Signed-off-by: Juergen Gross Acked-by: Rusty Russell Cc: jeremy@goop.org Cc: chrisw@sous-sol.org Cc: akataria@vmware.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xen.org Cc: konrad.wilk@oracle.com Cc: david.vrabel@citrix.com Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/1447771879-1806-1-git-send-email-jgross@suse.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 17 ----------------- arch/x86/include/asm/paravirt_types.h | 6 ------ arch/x86/include/asm/pgtable.h | 15 ++------------- arch/x86/kernel/paravirt.c | 3 --- arch/x86/lguest/boot.c | 1 - arch/x86/mm/pgtable.c | 7 +------ arch/x86/xen/mmu.c | 1 - 7 files changed, 3 insertions(+), 47 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4d7f080..cbbf41c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -366,23 +366,6 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } -static inline void pmd_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); -} - -static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); -} - -static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); -} static inline pte_t __pte(pteval_t val) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7afeafb..0451503 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -266,12 +266,6 @@ struct pv_mmu_ops { pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - void (*pmd_update)(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp); - void (*pmd_update_defer)(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c0b41f11..e99cbe8 100644 --- 
a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -69,9 +69,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pmd_clear(pmd) native_pmd_clear(pmd) #define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) -#define pmd_update(mm, addr, ptep) do { } while (0) -#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -721,14 +718,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, * updates should either be sets, clears, or set_pte_atomic for P->P * transitions, which means this hook should only be called for user PTEs. * This hook implies a P->P protection or access change has taken place, which - * requires a subsequent TLB flush. The notification can optionally be delayed - * until the TLB flush event by using the pte_update_defer form of the - * interface, but care must be taken to assure that the flush happens while - * still holding the same page table lock so that the shadow and primary pages - * do not become out of sync on SMP. + * requires a subsequent TLB flush. */ #define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) #endif /* @@ -820,9 +812,7 @@ static inline int pmd_write(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; + return native_pmdp_get_and_clear(pmdp); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -830,7 +820,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); - pmd_update(mm, addr, pmdp); } /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f27962c..3265ea0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -426,9 +426,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pmd = native_set_pmd, .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - .pmd_update = paravirt_nop, - .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a0d09f6..a1900d4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1472,7 +1472,6 @@ __init void lguest_init(void) pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; pv_mmu_ops.pte_update = lguest_pte_update; - pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC /* APIC read/write intercepts */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index fb0a9dd..ee9c2e3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -414,7 +414,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; - pte_update_defer(vma->vm_mm, address, ptep); + pte_update(vma->vm_mm, address, ptep); } return changed; @@ -431,7 +431,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; - pmd_update_defer(vma->vm_mm, address, pmdp); /* * We had a write-protection fault here and changed the pmd * to to more permissive. 
No need to flush the TLB for that, @@ -469,9 +468,6 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); - if (ret) - pmd_update(vma->vm_mm, addr, pmdp); - return ret; } #endif @@ -518,7 +514,6 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, set = !test_and_set_bit(_PAGE_BIT_SPLITTING, (unsigned long *)pmdp); if (set) { - pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c479fe..41ee3e2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2436,7 +2436,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_others = xen_flush_tlb_others, .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, -- cgit v1.1 From 7a9c2dd08eadd5c6943115dbbec040c38d2e0822 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 25 Nov 2015 01:03:41 +0800 Subject: x86/pm: Introduce quirk framework to save/restore extra MSR registers around suspend/resume A bug was reported that on certain Broadwell platforms, after resuming from S3, the CPU is running at an anomalously low speed. It turns out that the BIOS has modified the value of the THERM_CONTROL register during S3, and changed it from 0 to 0x10, thus enabled clock modulation(bit4), but with undefined CPU Duty Cycle(bit1:3) - which causes the problem. Here is a simple scenario to reproduce the issue: 1. Boot up the system 2. Get MSR 0x19a, it should be 0 3. Put the system into sleep, then wake it up 4. Get MSR 0x19a, it shows 0x10, while it should be 0 Although some BIOSen want to change the CPU Duty Cycle during S3, in our case we don't want the BIOS to do any modification. Fix this issue by introducing a more generic x86 framework to save/restore specified MSR registers(THERM_CONTROL in this case) for suspend/resume. This allows us to fix similar bugs in a much simpler way in the future. When the kernel wants to protect certain MSRs during suspending, we simply add a quirk entry in msr_save_dmi_table, and customize the MSR registers inside the quirk callback, for example: u32 msr_id_need_to_save[] = {MSR_ID0, MSR_ID1, MSR_ID2...}; and the quirk mechanism ensures that, once resumed from suspend, the MSRs indicated by these IDs will be restored to their original, pre-suspend values. Since both 64-bit and 32-bit kernels are affected, this patch covers the common 64/32-bit suspend/resume code path. And because the MSRs specified by the user might not be available or readable in any situation, we use rdmsrl_safe() to safely save these MSRs. Reported-and-tested-by: Marcin Kaszewski Signed-off-by: Chen Yu Acked-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Cc: len.brown@intel.com Cc: linux@horizon.com Cc: luto@kernel.org Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/c9abdcbc173dd2f57e8990e304376f19287e92ba.1448382971.git.yu.c.chen@intel.com [ More edits to the naming of data structures. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr.h | 10 +++++ arch/x86/include/asm/suspend_32.h | 1 + arch/x86/include/asm/suspend_64.h | 1 + arch/x86/power/cpu.c | 92 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 77d8b28..24feb3c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -32,6 +32,16 @@ struct msr_regs_info { int err; }; +struct saved_msr { + bool valid; + struct msr_info info; +}; + +struct saved_msrs { + unsigned int num; + struct saved_msr *array; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index d1793f0..8e9dbe7 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -15,6 +15,7 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; + struct saved_msrs saved_msrs; struct desc_ptr gdt_desc; struct desc_ptr idt; u16 ldt; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7ebf0eb..6136a18 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -24,6 +24,7 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; + struct saved_msrs saved_msrs; unsigned long efer; u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 9ab5279..d5f6499 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; @@ -32,6 +33,29 @@ __visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; +static void msr_save_context(struct saved_context *ctxt) +{ + struct saved_msr *msr = ctxt->saved_msrs.array; + struct saved_msr *end = msr + ctxt->saved_msrs.num; + + while (msr < end) { + msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q); + msr++; + } +} + +static void msr_restore_context(struct saved_context *ctxt) +{ + struct saved_msr *msr = ctxt->saved_msrs.array; + struct saved_msr *end = msr + ctxt->saved_msrs.num; + + while (msr < end) { + if (msr->valid) + wrmsrl(msr->info.msr_no, msr->info.reg.q); + msr++; + } +} + /** * __save_processor_state - save CPU registers before creating a * hibernation image and before restoring the memory state from it @@ -111,6 +135,7 @@ static void __save_processor_state(struct saved_context *ctxt) #endif ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, &ctxt->misc_enable); + msr_save_context(ctxt); } /* Needed by apm.c */ @@ -229,6 +254,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); + msr_restore_context(ctxt); } /* Needed by apm.c */ @@ -320,3 +346,69 @@ static int __init bsp_pm_check_init(void) } core_initcall(bsp_pm_check_init); + +static int msr_init_context(const u32 *msr_id, const int total_num) +{ + int i = 0; + struct saved_msr *msr_array; + + if (saved_context.saved_msrs.array || saved_context.saved_msrs.num > 0) { + pr_err("x86/pm: MSR quirk already applied, please check your DMI match table.\n"); + return -EINVAL; + } + + msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL); + if (!msr_array) { + pr_err("x86/pm: Can not 
allocate memory to save/restore MSRs during suspend.\n"); + return -ENOMEM; + } + + for (i = 0; i < total_num; i++) { + msr_array[i].info.msr_no = msr_id[i]; + msr_array[i].valid = false; + msr_array[i].info.reg.q = 0; + } + saved_context.saved_msrs.num = total_num; + saved_context.saved_msrs.array = msr_array; + + return 0; +} + +/* + * The following section is a quirk framework for problematic BIOSen: + * Sometimes MSRs are modified by the BIOSen after suspended to + * RAM, this might cause unexpected behavior after wakeup. + * Thus we save/restore these specified MSRs across suspend/resume + * in order to work around it. + * + * For any further problematic BIOSen/platforms, + * please add your own function similar to msr_initialize_bdw. + */ +static int msr_initialize_bdw(const struct dmi_system_id *d) +{ + /* Add any extra MSR ids into this array. */ + u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; + + pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); + return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); +} + +static struct dmi_system_id msr_save_dmi_table[] = { + { + .callback = msr_initialize_bdw, + .ident = "BROADWELL BDX_EP", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"), + DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"), + }, + }, + {} +}; + +static int pm_check_save_msr(void) +{ + dmi_check_system(msr_save_dmi_table); + return 0; +} + +device_initcall(pm_check_save_msr); -- cgit v1.1 From b49a8fe52626814968b9a9d27d7ad1cadc5532ed Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 Nov 2015 16:08:20 +0900 Subject: perf callchain: Honor hide_unresolved If user requested to hide unresolved entries, skip unresolved callchains as well as hist entries. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Kan Liang Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1448521700-32062-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 7 +++---- tools/perf/util/machine.c | 5 +++++ tools/perf/util/symbol.h | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1442834..8a9c690 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -45,7 +45,6 @@ struct report { struct perf_tool tool; struct perf_session *session; bool use_tui, use_gtk, use_stdio; - bool hide_unresolved; bool dont_use_callchains; bool show_full_info; bool show_threads; @@ -146,7 +145,7 @@ static int process_sample_event(struct perf_tool *tool, struct hist_entry_iter iter = { .evsel = evsel, .sample = sample, - .hide_unresolved = rep->hide_unresolved, + .hide_unresolved = symbol_conf.hide_unresolved, .add_entry_cb = hist_iter__report_callback, }; int ret = 0; @@ -157,7 +156,7 @@ static int process_sample_event(struct perf_tool *tool, return -1; } - if (rep->hide_unresolved && al.sym == NULL) + if (symbol_conf.hide_unresolved && al.sym == NULL) goto out_put; if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap)) @@ -740,7 +739,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added between " "columns '.' 
is reserved."), - OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved, + OPT_BOOLEAN('U', "hide-unresolved", &symbol_conf.hide_unresolved, "Only display entries resolved to a symbol"), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 7f5071a..f0019b7 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1618,6 +1618,8 @@ static int add_callchain_ip(struct thread *thread, } } + if (symbol_conf.hide_unresolved && al.sym == NULL) + return 0; return callchain_cursor_append(&callchain_cursor, al.addr, al.map, al.sym); } @@ -1872,6 +1874,9 @@ check_calls: static int unwind_entry(struct unwind_entry *entry, void *arg) { struct callchain_cursor *cursor = arg; + + if (symbol_conf.hide_unresolved && entry->sym == NULL) + return 0; return callchain_cursor_append(cursor, entry->ip, entry->map, entry->sym); } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index dcd786e..857f707 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -108,7 +108,8 @@ struct symbol_conf { show_hist_headers, branch_callstack, has_filter, - show_ref_callgraph; + show_ref_callgraph, + hide_unresolved; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- cgit v1.1 From 0356218a68551f051998f4fb5074a1eed7a346fe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 Nov 2015 16:08:18 +0900 Subject: perf top: Fix freeze on --call-graph flat/folded The callchain rbtree is rebuilt periodically, so it needs to reinitialize the root everytime. Otherwise it can be stuck in the rbtree insertion with stale pointers. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1448521700-32062-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index fc3b1e0..564377d 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -290,6 +290,7 @@ static void sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root, u64 min_hit, struct callchain_param *param __maybe_unused) { + *rb_root = RB_ROOT; __sort_chain_flat(rb_root, &root->node, min_hit); } -- cgit v1.1 From 2aaecfc51bc65532152e141df3268fda06cae029 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 26 Nov 2015 14:55:23 +0100 Subject: perf script: Remove default_scripting_ops The default script handler (the one that displays samples on screen) is implemented scripting_ops instance with process_event callback. This way we can't pass any script config into display function, because we don't want perl or python handlers to be depended on perf script internals. Removing the default_scripting_ops and calling process event function directly. This way it's possible to pass perf_script struct and process configuration data in following commit. 
Signed-off-by: Jiri Olsa Acked-by: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448546125-29245-1-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 72b5deb..8e3f804 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -643,57 +643,24 @@ static void process_event(union perf_event *event, struct perf_sample *sample, printf("\n"); } -static int default_start_script(const char *script __maybe_unused, - int argc __maybe_unused, - const char **argv __maybe_unused) -{ - return 0; -} - -static int default_flush_script(void) -{ - return 0; -} - -static int default_stop_script(void) -{ - return 0; -} - -static int default_generate_script(struct pevent *pevent __maybe_unused, - const char *outfile __maybe_unused) -{ - return 0; -} - -static struct scripting_ops default_scripting_ops = { - .start_script = default_start_script, - .flush_script = default_flush_script, - .stop_script = default_stop_script, - .process_event = process_event, - .generate_script = default_generate_script, -}; - static struct scripting_ops *scripting_ops; static void setup_scripting(void) { setup_perl_scripting(); setup_python_scripting(); - - scripting_ops = &default_scripting_ops; } static int flush_scripting(void) { - return scripting_ops->flush_script(); + return scripting_ops ? scripting_ops->flush_script() : 0; } static int cleanup_scripting(void) { pr_debug("\nperf script stopped\n"); - return scripting_ops->stop_script(); + return scripting_ops ? scripting_ops->stop_script() : 0; } static int process_sample_event(struct perf_tool *tool __maybe_unused, @@ -727,7 +694,11 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) goto out_put; - scripting_ops->process_event(event, sample, evsel, &al); + if (scripting_ops) + scripting_ops->process_event(event, sample, evsel, &al); + else + process_event(event, sample, evsel, &al); + out_put: addr_location__put(&al); return 0; -- cgit v1.1 From 67befc652845c8ffbefc8d173a6e6ced14d472f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 26 Nov 2015 14:54:04 +0100 Subject: perf build: Fix traceevent plugins build race Ingo reported the following build failure: $ make clean install ... CC plugin_kmem.o fixdep: error opening depfile: ./.plugin_hrtimer.o.d: No such file or directory /home/mingo/tip/tools/build/Makefile.build:77: recipe for target 'plugin_hrtimer.o' failed make[3]: *** [plugin_hrtimer.o] Error 2 Makefile:189: recipe for target 'plugin_hrtimer-in.o' failed make[2]: *** [plugin_hrtimer-in.o] Error 2 Makefile.perf:414: recipe for target 'libtraceevent_plugins' failed make[1]: *** [libtraceevent_plugins] Error 2 make[1]: *** Waiting for unfinished jobs.... Currently we have the install-traceevent-plugins target being dependent on $(LIBTRACEEVENT), which will actually not build any plugin. So the install-traceevent-plugins target itself will try to build the plugins. But a plugins build is also triggered by the perf build itself, via the libtraceevent_plugins target. This can cause a race where one make thread removes temp files from another, resulting in the error above. Fix this by adding a proper plugins build dependency before installing the plugins.
Reported-and-Tested-by: Ingo Molnar Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448546044-28973-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 0d19d54..929a32b 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -420,7 +420,7 @@ $(LIBTRACEEVENT)-clean: $(call QUIET_CLEAN, libtraceevent) $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null -install-traceevent-plugins: $(LIBTRACEEVENT) +install-traceevent-plugins: libtraceevent_plugins $(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) install_plugins $(LIBAPI): fixdep FORCE -- cgit v1.1 From c03d5184f0e92fa696e4b57f54ffc3b19a92f704 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 26 Nov 2015 03:59:57 +0000 Subject: perf machine: Adjust dso->long_name for offline module Something unexpected may happen if you copy a statically linked perf to a production environment: # ./perf probe -m ./mymodule.ko my_func [mymodule] with build id 326ab42550ef3d24944f53c817533728367effeb not found, continuing without symbols Failed to find symbol my_func in /home/wangnan/kmodule/mymodule.ko Error: Failed to add events. # ./perf buildid-cache -a ./mymodule.ko # ./perf probe -m ./mymodule.ko my_func Added new event: probe:my_func (on my_func in /home/wangnan/kmodule/mymodule.ko) You can now use it in all perf tools, such as: perf record -e probe:my_func -aR sleep 1 Where: # ldd ./perf not a dynamic executable # strace -e open ./perf probe -m ./mymodule.ko my_func ... open("/home/wangnan/kmodule/mymodule.ko", O_RDONLY) = 3 open("/home/wangnan/kmodule/../lib64/elfutils/libebl_x86_64.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) ... open("/lib64/tls/libebl_x86_64.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/lib64/libebl_x86_64.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/usr/lib64/tls/libebl_x86_64.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("/usr/lib64/libebl_x86_64.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory) open("[mymodule]", O_RDONLY) = -1 ENOENT (No such file or directory) open("/home/wangnan/.debug/.build-id/32/6ab42550ef3d24944f53c817533728367effeb", O_RDONLY) = -1 ENOENT (No such file or directory) open("[mymodule]", O_RDONLY) = -1 ENOENT (No such file or directory) In the above example, probe fails before we put the module into the buildid-cache. However, the user would expect it to succeed in both cases, because perf is actually able to find the probe points. The reason is that perf won't utilize the module's full path if it fails to open debuginfo. In: convert_to_probe_trace_events -> find_probe_trace_events_from_map -> get_target_map -> kernel_get_module_map -> machine__findnew_module_map -> map_groups__find_by_name map_groups__find_by_name() is able to find the map of that module, but this information is found from /proc/modules before it knows the real path of the offline module. Therefore, the map->dso->long_name is set to something like '[mymodule]', which prevents dso__load() from finding the real path of the module file. On the other hand, if dso__load() can get the offline module through the buildid cache, it can read the symbol table from that .ko. Even if debuginfo is not available, 'perf probe' can succeed if the '.symtab' can be found.
This patch improves machine__findnew_module_map(): when dso->long_name starts with '[' (i.e. the module path was not found when parsing /proc/modules), fix it via dso__set_long_name(), so a following dso__load() is able to find the symbol table. This patch won't interfere with buildid matching. Here is the test result: # ./perf probe -m ./mymodule.ko my_func Added new event: probe:my_func (on my_func in /home/wangnan/kmodule/mymodule.ko) You can now use it in all perf tools, such as: perf record -e probe:my_func -aR sleep 1 # ./perf probe -d '*' Removed event: probe:my_func # mv ./mymodule.{ko,.bak} # mv ./moduleb.ko mymodule.ko # ./perf probe -m ./mymodule.ko my_func /home/wangnan/kmodule/mymodule.ko with build id 326ab42550ef3d24944f53c817533728367effeb not found, continuing without symbols Failed to find symbol my_func in /home/wangnan/kmodule/mymodule.ko Error: Failed to add events. # ./perf probe -v -m ./mymodule.ko my_func probe-definition(0): my_func symbol:my_func file:(null) line:0 offset:0 return:0 lazy:(null) 0 arguments Could not open debuginfo. Try to use symbols. symsrc__init: build id mismatch for /home/wangnan/kmodule/mymodule.ko. /home/wangnan/kmodule/mymodule.ko with build id 326ab42550ef3d24944f53c817533728367effeb not found, continuing without symbols Failed to find symbol my_func in /home/wangnan/kmodule/mymodule.ko Error: Failed to add events. Reason: No such file or directory (Code: -2) Signed-off-by: Wang Nan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448510397-187965-1-git-send-email-wangnan0@huawei.com [ Renamed adjust_dso_long_name() to dso__adjust_kmod_long_name() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f0019b7..95a7f60 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -561,6 +561,24 @@ int machine__process_switch_event(struct machine *machine __maybe_unused, return 0; } +static void dso__adjust_kmod_long_name(struct dso *dso, const char *filename) +{ + const char *dup_filename; + + if (!filename || !dso || !dso->long_name) + return; + if (dso->long_name[0] != '[') + return; + if (!strchr(filename, '/')) + return; + + dup_filename = strdup(filename); + if (!dup_filename) + return; + + dso__set_long_name(dso, dup_filename, true); +} + struct map *machine__findnew_module_map(struct machine *machine, u64 start, const char *filename) { @@ -573,8 +591,15 @@ struct map *machine__findnew_module_map(struct machine *machine, u64 start, map = map_groups__find_by_name(&machine->kmaps, MAP__FUNCTION, m.name); - if (map) + if (map) { + /* + * If the map's dso is an offline module, give dso__load() + * a chance to find the file path of that module by fixing + * long_name. + */ + dso__adjust_kmod_long_name(map->dso, filename); goto out; + } dso = machine__findnew_module_dso(machine, &m, filename); if (dso == NULL) -- cgit v1.1 From b2be5451f660e0ee230969cc24121d9e210a91de Mon Sep 17 00:00:00 2001 From: Yannick Brosseau Date: Thu, 26 Nov 2015 03:42:32 -0800 Subject: perf tools: Correctly identify anon_hugepage when generating map (v2) When parsing /proc/xxx/maps, the sscanf in perf_event__synthesize_mmap_events truncates the map name at the space in "/anon_hugepage (deleted)". is_anon_memory() then only receives the string "/anon_hugepage" and does not detect it.
We change is_anon_memory() to only compare the first part of the string, effectively ignoring if " (deleted)" is there. Signed-off-by: Yannick Brosseau Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Joshua Zhu Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/1448538152-2898-1-git-send-email-scientist@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index afc6b56..93d9f1c 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -26,8 +26,8 @@ const char *map_type__name[MAP__NR_TYPES] = { static inline int is_anon_memory(const char *filename) { return !strcmp(filename, "//anon") || - !strcmp(filename, "/dev/zero (deleted)") || - !strcmp(filename, "/anon_hugepage (deleted)"); + !strncmp(filename, "/dev/zero", sizeof("/dev/zero") - 1) || + !strncmp(filename, "/anon_hugepage", sizeof("/anon_hugepage") - 1); } static inline int is_no_dso_memory(const char *filename) -- cgit v1.1 From 809e9423d7bc72e50d94d8267bab010a007a6137 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 26 Nov 2015 18:55:21 +0100 Subject: perf script: Pass perf_script into process_event Passing perf_script struct into process_event function, so we could process configuration data for event printing. It will be used in following patch to get event name string width. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151126175521.GA18979@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8e3f804..3c3f8d0 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -588,8 +588,17 @@ static void print_sample_flags(u32 flags) printf(" %-4s ", str); } -static void process_event(union perf_event *event, struct perf_sample *sample, - struct perf_evsel *evsel, struct addr_location *al) +struct perf_script { + struct perf_tool tool; + struct perf_session *session; + bool show_task_events; + bool show_mmap_events; + bool show_switch_events; +}; + +static void process_event(struct perf_script *script __maybe_unused, union perf_event *event, + struct perf_sample *sample, struct perf_evsel *evsel, + struct addr_location *al) { struct thread *thread = al->thread; struct perf_event_attr *attr = &evsel->attr; @@ -663,12 +672,13 @@ static int cleanup_scripting(void) return scripting_ops ? 
scripting_ops->stop_script() : 0; } -static int process_sample_event(struct perf_tool *tool __maybe_unused, +static int process_sample_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, struct machine *machine) { + struct perf_script *scr = container_of(tool, struct perf_script, tool); struct addr_location al; if (debug_mode) { @@ -697,21 +707,13 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, if (scripting_ops) scripting_ops->process_event(event, sample, evsel, &al); else - process_event(event, sample, evsel, &al); + process_event(scr, event, sample, evsel, &al); out_put: addr_location__put(&al); return 0; } -struct perf_script { - struct perf_tool tool; - struct perf_session *session; - bool show_task_events; - bool show_mmap_events; - bool show_switch_events; -}; - static int process_attr(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist) { -- cgit v1.1 From 5e50426d5d9049dfdb8b2b18e761717e7e80a6ad Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 26 Nov 2015 19:50:55 +0100 Subject: tools build: Use fixdep with OUTPUT path prefix Add an OUTPUT path prefix to the fixdep target so that it is used properly in out-of-tree builds. If fixdep already existed in the source tree, the out-of-tree build would see it already existed and would not build the out-of-tree version, as reported by Arnaldo: [acme@zoo linux]$ make O=/tmp/build/perf -C tools/perf make: Entering directory '/home/git/linux/tools/perf' BUILD: Doing 'make -j4' parallel build make[2]: Nothing to be done for 'fixdep'. make: Leaving directory '/home/git/linux/tools/perf' Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/20151126185055.GC19410@krava.brq.redhat.com [ Fixed conflict with 5725dd8fa888 ("tools build: Clean CFLAGS and LDFLAGS for fixdep") ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile | 2 +- tools/build/Makefile.include | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build/Makefile b/tools/build/Makefile index a930362..0d5a0e3 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -25,7 +25,7 @@ export Q srctree CC LD MAKEFLAGS := --no-print-directory build := -f $(srctree)/tools/build/Makefile.build dir=. obj -all: fixdep +all: $(OUTPUT)fixdep clean: $(call QUIET_CLEAN, fixdep) diff --git a/tools/build/Makefile.include b/tools/build/Makefile.include index 6254760..be630be 100644 --- a/tools/build/Makefile.include +++ b/tools/build/Makefile.include @@ -4,7 +4,7 @@ ifdef CROSS_COMPILE fixdep: else fixdep: - $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= fixdep + $(Q)$(MAKE) -C $(srctree)/tools/build CFLAGS= LDFLAGS= $(OUTPUT)fixdep endif .PHONY: fixdep -- cgit v1.1 From aac4864727f4b3838ec1c03277bbc47a237b7516 Mon Sep 17 00:00:00 2001 From: Ekaterina Tumanova Date: Wed, 25 Nov 2015 17:32:45 +0100 Subject: perf symbols: Refactor vmlinux_path__init() to ease path additions Refactor vmlinux_path__init() to ease subsequent additions of new vmlinux locations. Signed-off-by: Ekaterina Tumanova Acked-by: Alexander Yarygin Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Christian Borntraeger Cc: David Ahern Cc: Namhyung Kim Cc: Naveen N.
Rao Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1448469166-61363-2-git-send-email-tumanova@linux.vnet.ibm.com [ Rename vmlinux_path__update() to vmlinux_path__add() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 64 +++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index cd08027..e2ac6b6 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1860,24 +1860,43 @@ static void vmlinux_path__exit(void) zfree(&vmlinux_path); } +static const char * const vmlinux_paths[] = { + "vmlinux", + "/boot/vmlinux" +}; + +static const char * const vmlinux_paths_upd[] = { + "/boot/vmlinux-%s", + "/usr/lib/debug/boot/vmlinux-%s", + "/lib/modules/%s/build/vmlinux", + "/usr/lib/debug/lib/modules/%s/vmlinux" +}; + +static int vmlinux_path__add(const char *new_entry) +{ + vmlinux_path[vmlinux_path__nr_entries] = strdup(new_entry); + if (vmlinux_path[vmlinux_path__nr_entries] == NULL) + return -1; + ++vmlinux_path__nr_entries; + + return 0; +} + static int vmlinux_path__init(struct perf_env *env) { struct utsname uts; char bf[PATH_MAX]; char *kernel_version; + unsigned int i; - vmlinux_path = malloc(sizeof(char *) * 6); + vmlinux_path = malloc(sizeof(char *) * (ARRAY_SIZE(vmlinux_paths) + + ARRAY_SIZE(vmlinux_paths_upd))); if (vmlinux_path == NULL) return -1; - vmlinux_path[vmlinux_path__nr_entries] = strdup("vmlinux"); - if (vmlinux_path[vmlinux_path__nr_entries] == NULL) - goto out_fail; - ++vmlinux_path__nr_entries; - vmlinux_path[vmlinux_path__nr_entries] = strdup("/boot/vmlinux"); - if (vmlinux_path[vmlinux_path__nr_entries] == NULL) - goto out_fail; - ++vmlinux_path__nr_entries; + for (i = 0; i < ARRAY_SIZE(vmlinux_paths); i++) + if (vmlinux_path__add(vmlinux_paths[i]) < 0) + goto out_fail; /* only try kernel version if no symfs was given */ if (symbol_conf.symfs[0] != 0) @@ -1892,28 +1911,11 @@ static int vmlinux_path__init(struct perf_env *env) kernel_version = uts.release; } - snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", kernel_version); - vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); - if (vmlinux_path[vmlinux_path__nr_entries] == NULL) - goto out_fail; - ++vmlinux_path__nr_entries; - snprintf(bf, sizeof(bf), "/usr/lib/debug/boot/vmlinux-%s", kernel_version); - vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); - if (vmlinux_path[vmlinux_path__nr_entries] == NULL) - goto out_fail; - ++vmlinux_path__nr_entries; - snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", kernel_version); - vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); - if (vmlinux_path[vmlinux_path__nr_entries] == NULL) - goto out_fail; - ++vmlinux_path__nr_entries; - snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux", kernel_version); - vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); - if (vmlinux_path[vmlinux_path__nr_entries] == NULL) - goto out_fail; - ++vmlinux_path__nr_entries; + for (i = 0; i < ARRAY_SIZE(vmlinux_paths_upd); i++) { + snprintf(bf, sizeof(bf), vmlinux_paths_upd[i], kernel_version); + if (vmlinux_path__add(bf) < 0) + goto out_fail; + } return 0; -- cgit v1.1 From f55ae9540d16a355e61cb57b035aab9e1ae2da28 Mon Sep 17 00:00:00 2001 From: Ekaterina Tumanova Date: Wed, 25 Nov 2015 17:32:46 +0100 Subject: perf symbols: Add the path to vmlinux.debug Currently, when debuginfo is separated into vmlinux.debug, its contents get ignored. Let's change that and add it to the vmlinux_path list.
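The shape of the refactor is easiest to see in isolation. The following stand-alone sketch (the paths and version string are illustrative, not the perf code itself) shows the same table-driven pattern: fixed entries plus printf-style templates expanded in one loop, which is exactly why the vmlinux.debug addition above can be a one-line change:

#include <stdio.h>

static const char * const fixed_paths[] = {
	"vmlinux",
	"/boot/vmlinux",
};

static const char * const templated_paths[] = {
	"/boot/vmlinux-%s",
	"/usr/lib/debug/boot/vmlinux-%s",
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
	const char *kver = "4.4.0";	/* stand-in for uts.release */
	char buf[256];
	unsigned int i;

	/* fixed candidates are used verbatim */
	for (i = 0; i < ARRAY_SIZE(fixed_paths); i++)
		printf("candidate: %s\n", fixed_paths[i]);

	/* templated candidates are expanded with the kernel version */
	for (i = 0; i < ARRAY_SIZE(templated_paths); i++) {
		snprintf(buf, sizeof(buf), templated_paths[i], kver);
		printf("candidate: %s\n", buf);
	}
	return 0;
}

Sizing the allocation with ARRAY_SIZE of both tables, as the patch does, also removes the magic "6" that previously had to be kept in sync by hand.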
Signed-off-by: Ekaterina Tumanova Acked-by: Alexander Yarygin Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Christian Borntraeger Cc: David Ahern Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1448469166-61363-3-git-send-email-tumanova@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e2ac6b6..d51abd2 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1869,7 +1869,8 @@ static const char * const vmlinux_paths_upd[] = { "/boot/vmlinux-%s", "/usr/lib/debug/boot/vmlinux-%s", "/lib/modules/%s/build/vmlinux", - "/usr/lib/debug/lib/modules/%s/vmlinux" + "/usr/lib/debug/lib/modules/%s/vmlinux", + "/usr/lib/debug/boot/vmlinux-%s.debug" }; static int vmlinux_path__add(const char *new_entry) -- cgit v1.1 From 6acd8e9271cdeaec458fd4eec4a6765d16e0e61c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 25 Nov 2015 16:36:54 +0100 Subject: perf stat: Clear sample_(type|period) for counting Clear sample_(type|period) for counting, as it only confuses debug output with unwanted sampling details: Before: $ sudo perf stat -e 'raw_syscalls:sys_enter' -vv ls ------------------------------------------------------------ perf_event_attr: type 2 size 112 config 0x11 { sample_period, sample_freq } 1 sample_type TIME|CPU|PERIOD|RAW read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ... After: $ sudo perf stat -e 'raw_syscalls:sys_enter' -vv ls ------------------------------------------------------------ perf_event_attr: type 2 size 112 config 0x11 read_format TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING disabled 1 inherit 1 enable_on_exec 1 exclude_guest 1 ... Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448465815-27404-1-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e77880b..df2fbf0 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -161,6 +161,13 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->inherit = !no_inherit; + /* + * Some events get initialized with sample_(period/type) set, + * like tracepoints. Clear it up for counting. + */ + attr->sample_period = 0; + attr->sample_type = 0; + if (target__has_cpu(&target)) return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); -- cgit v1.1 From dcdd184b60c3943fb678dcbaf899a26f845901ad Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 25 Nov 2015 16:36:55 +0100 Subject: perf evlist: Display WEIGHT sample type bit Adding a WEIGHT bit_name entry so that sample_type is displayed properly. $ perf evlist -v cpu/mem-loads/pp: ...SNIP... sample_type: IP|TID|TIME|ADDR|ID|CPU|DATA_SRC|WEIGHT ...
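The decoding behind that sample_type line is a simple bit-to-name table walked over the mask. A self-contained sketch of the same idea (the bit definitions are toy stand-ins, not the real PERF_SAMPLE_* values or perf's exact __p_sample_type() code):

#include <stdio.h>
#include <string.h>

/* expands to { 1ULL << IP_SHIFT, "IP" } etc., mirroring perf's bit_name() trick */
#define bit_name(n) { 1ULL << n##_SHIFT, #n }

enum { IP_SHIFT, TID_SHIFT, WEIGHT_SHIFT };	/* toy shifts */

struct bit_names { unsigned long long bit; const char *name; };

static void bits_to_string(char *buf, size_t size, unsigned long long value,
			   const struct bit_names *bits)
{
	buf[0] = '\0';
	for (; bits->name; bits++) {
		if (!(value & bits->bit))
			continue;
		if (buf[0])
			strncat(buf, "|", size - strlen(buf) - 1);
		strncat(buf, bits->name, size - strlen(buf) - 1);
	}
}

int main(void)
{
	static const struct bit_names names[] = {
		bit_name(IP), bit_name(TID), bit_name(WEIGHT),
		{ 0, NULL },
	};
	char buf[128];

	bits_to_string(buf, sizeof(buf), (1 << IP_SHIFT) | (1 << WEIGHT_SHIFT), names);
	printf("%s\n", buf);	/* prints IP|WEIGHT */
	return 0;
}

A missing table entry does not break the walk; the bit is just silently omitted from the output, which is why the WEIGHT omission went unnoticed until someone compared the printed mask with the raw value.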
Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448465815-27404-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 397fb4e..0a1f4d9 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1192,6 +1192,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), + bit_name(WEIGHT), { .name = NULL, } }; #undef bit_name -- cgit v1.1 From 43798bf37215fe242e592fd4605d804e2da0781b Mon Sep 17 00:00:00 2001 From: He Kuang Date: Tue, 24 Nov 2015 13:36:08 +0000 Subject: bpf tools: Add helper function for updating bpf maps elements Add bpf_map_update_elem() helper function which calls the sys_bpf syscall to update elements in bpf maps. Upcoming patches will use it to adjust data in map through the perf command line. Signed-off-by: He Kuang Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448372181-151723-4-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/bpf.c | 14 ++++++++++++++ tools/lib/bpf/bpf.h | 2 ++ 2 files changed, 16 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a633105..5bdc6ea 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -83,3 +83,17 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns, log_buf[0] = 0; return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } + +int bpf_map_update_elem(int fd, void *key, void *value, + u64 flags) +{ + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 854b736..a764655 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -20,4 +20,6 @@ int bpf_load_program(enum bpf_prog_type type, struct bpf_insn *insns, u32 kern_version, char *log_buf, size_t log_buf_sz); +int bpf_map_update_elem(int fd, void *key, void *value, + u64 flags); #endif -- cgit v1.1 From e49a449b869afb2b8bf282427c8355bc3a2fad56 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 13 Nov 2015 15:18:31 +0100 Subject: x86/fpu: Put a few variables in .init.data These are clearly just used during init. Signed-off-by: Rasmus Villemoes Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447424312-26400-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index be39b5f..e1ed519 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -188,7 +188,7 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu = 1; + static int on_boot_cpu __initdata = 1; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -278,7 +278,7 @@ __setup("eagerfpu=", eager_fpu_setup); */ static void __init fpu__init_system_ctx_switch(void) { - static bool on_boot_cpu = 1; + static bool on_boot_cpu __initdata = 1; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 70fc312..40f1002 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -297,7 +297,7 @@ static void __init setup_xstate_comp(void) */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu = 1; + static int on_boot_cpu __initdata = 1; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -608,7 +608,7 @@ static void fpu__init_disable_system_xstate(void) void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu = 1; + static int on_boot_cpu __initdata = 1; int err; WARN_ON_FPU(!on_boot_cpu); -- cgit v1.1 From c4e079494f9258a3d2045779a3c9d6599c773dd9 Mon Sep 17 00:00:00 2001 From: Michael Petlan Date: Fri, 27 Nov 2015 14:48:09 +0100 Subject: perf buildid-list: Show running kernel build id fix The --kernel option of the perf buildid-list tool should show the running kernel's build id. The functionality has been lost during other changes of the related code. The build_id__sprintf() function should return the length of the build-id string, but it returned the length of the raw build-id data instead. Due to that, a return-value check caused the final string not to be printed. With this patch build_id__sprintf() returns the correct value, so the --kernel option works again. Before: # perf buildid-list --kernel # After: # perf buildid-list --kernel 972c1edab5bdc06cc224af45d510af662a3c6972 # Signed-off-by: Michael Petlan Cc: Jiri Olsa Cc: Masami Hiramatsu LPU-Reference: 1448632089.24573.114.camel@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/build-id.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 217b5a6..6a7e273 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -91,7 +91,7 @@ int build_id__sprintf(const u8 *build_id, int len, char *bf) bid += 2; } - return raw - build_id; + return (bid - bf) + 1; } int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id) -- cgit v1.1 From 9bdcede563a831f139b5fc872f028ef844a7462e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 27 Nov 2015 09:21:21 +0100 Subject: perf test: 'unwind' test should create kernel maps The 'perf test unwind' entry is failing because it forgot to create the kernel maps; fix it.
After the patch: # perf test unwind 40: Test dwarf unwind : Ok Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Masami Hiramatsu Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20151127082121.GA24503@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/dwarf-unwind.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index b2357e8..3cce13b1 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -173,6 +173,11 @@ int test__dwarf_unwind(int subtest __maybe_unused) return -1; } + if (machine__create_kernel_maps(machine)) { + pr_err("Failed to create kernel maps\n"); + return -1; + } + callchain_param.record_mode = CALLCHAIN_DWARF; if (init_live_machine(machine)) { -- cgit v1.1 From bae9cc41105b9edd74d68a9636be2ba240e74b9e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Nov 2015 15:54:33 -0300 Subject: perf list: Add support for PERF_COUNT_SW_BPF_OUT When PERF_COUNT_SW_BPF_OUTPUT was added to the kernel we should've added it to tools/perf, where it is used just to list events. This ended up causing a segfault in commands like "perf list stall". Fix it by adding that new software counter. A patch to robustify perf to not segfault when the next counter gets added in the kernel will follow this one. Reported-by: Ingo Molnar Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-uya354upi3eprsey6mi5962d@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index e48d9da..40ae92a 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -124,6 +124,10 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { .symbol = "dummy", .alias = "", }, + [PERF_COUNT_SW_BPF_OUTPUT] = { + .symbol = "bpf-output", + .alias = "", + }, }; #define __PERF_EVENT_FIELD(config, name) \ -- cgit v1.1 From e37df6c76cb19971f1228bfaff504d8a3ea6f748 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 27 Nov 2015 16:04:58 -0300 Subject: perf list: Robustify event printing routine When a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") added PERF_COUNT_SW_BPF_OUTPUT we ended up with a new entry in the event_symbols_sw array that wasn't initialized, thus set to NULL, fix print_symbol_events() to check for that case so that we don't crash if this happens again. 
(gdb) bt #0 __match_glob (ignore_space=false, pat=, str=) at util/string.c:198 #1 strglobmatch (str=, pat=pat@entry=0x7fffffffe61d "stall") at util/string.c:252 #2 0x00000000004993a5 in print_symbol_events (type=1, syms=0x872880 , max=11, name_only=false, event_glob=0x7fffffffe61d "stall") at util/parse-events.c:1615 #3 print_events (event_glob=event_glob@entry=0x7fffffffe61d "stall", name_only=false) at util/parse-events.c:1675 #4 0x000000000042c79e in cmd_list (argc=1, argv=0x7fffffffe390, prefix=) at builtin-list.c:68 #5 0x00000000004788a5 in run_builtin (p=p@entry=0x871758 , argc=argc@entry=2, argv=argv@entry=0x7fffffffe390) at perf.c:370 #6 0x0000000000420ab0 in handle_internal_command (argv=0x7fffffffe390, argc=2) at perf.c:429 #7 run_argv (argv=0x7fffffffe110, argcp=0x7fffffffe11c) at perf.c:473 #8 main (argc=2, argv=0x7fffffffe390) at perf.c:588 (gdb) p event_symbols_sw[PERF_COUNT_SW_BPF_OUTPUT] $4 = {symbol = 0x0, alias = 0x0} (gdb) A patch to robustify perf to not segfault when the next counter gets added in the kernel will follow this one. Reported-by: Ingo Molnar Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-57wysblcjfrseb0zg5u7ek10@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 40ae92a..6fc8cd7 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1883,7 +1883,7 @@ restart: for (i = 0; i < max; i++, syms++) { - if (event_glob != NULL && + if (event_glob != NULL && syms->symbol != NULL && !(strglobmatch(syms->symbol, event_glob) || (syms->alias && strglobmatch(syms->alias, event_glob)))) continue; -- cgit v1.1 From 25b1606be1a910a63a23c3d1006581c9aad4e6e3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 28 Nov 2015 02:32:37 +0900 Subject: perf report: Show error message when processing sample fails Currently when perf fails to process samples for some reason, it doesn't show any message about the failure. This is very inconvenient for users especially on TUI as screen is reset after the failure. 
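Both parse-events fixes above come down to one C property: a designated-initializer array sized by a kernel _MAX constant silently grows zero-filled holes whenever the kernel gains a counter before the tool does. A stand-alone sketch of the hazard and the defensive NULL check (toy enum values, not the real PERF_COUNT_SW_* constants):

#include <stdio.h>

struct event_symbol {
	const char *symbol;
	const char *alias;
};

enum { SW_CPU_CLOCK, SW_TASK_CLOCK, SW_BPF_OUTPUT, SW_MAX };

static struct event_symbol symbols[SW_MAX] = {
	[SW_CPU_CLOCK]  = { .symbol = "cpu-clock",  .alias = "" },
	[SW_TASK_CLOCK] = { .symbol = "task-clock", .alias = "" },
	/*
	 * SW_BPF_OUTPUT intentionally left out: it is zero-initialized,
	 * i.e. { NULL, NULL }, exactly like the pre-fix perf array.
	 */
};

int main(void)
{
	for (int i = 0; i < SW_MAX; i++) {
		/* the NULL check is the "robustify" part of the second fix */
		if (symbols[i].symbol == NULL)
			continue;
		printf("%s\n", symbols[i].symbol);
	}
	return 0;
}

The first patch fills the hole; the second makes the iteration survive the next hole, which is why both are needed.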
Reported-by: Ingo Molnar Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448645559-31167-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8a9c690..af5db88 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -513,20 +513,26 @@ static int __cmd_report(struct report *rep) if (rep->cpu_list) { ret = perf_session__cpu_bitmap(session, rep->cpu_list, rep->cpu_bitmap); - if (ret) + if (ret) { + ui__error("failed to set cpu bitmap\n"); return ret; + } } if (rep->show_threads) perf_read_values_init(&rep->show_threads_values); ret = report__setup_sample_type(rep); - if (ret) + if (ret) { + /* report__setup_sample_type() already showed error message */ return ret; + } ret = perf_session__process_events(session); - if (ret) + if (ret) { + ui__error("failed to process sample\n"); return ret; + } report__warn_kptr_restrict(rep); -- cgit v1.1 From e72655d97d24fff559b4ab59de791c3741a74c8c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 28 Nov 2015 02:32:38 +0900 Subject: perf hists: Do not skip elided fields when processing samples If the user gives a filter, perf marks the corresponding column elided and omits it from the output. But it should still process and aggregate samples using that field; otherwise samples are aggregated as if the column was not there, resulting in incorrect output. For example, I'd like to set a filter on native_write_msr_safe. The original overhead of the function is negligible. $ perf report | grep native_write_msr_safe 0.00% swapper [kernel.vmlinux] native_write_msr_safe 0.00% perf [kernel.vmlinux] native_write_msr_safe However, adding the -S option gives different output. $ perf report -S native_write_msr_safe --percentage absolute | \ > grep -e swapper -e perf 51.47% swapper [kernel.vmlinux] 4.14% perf [kernel.vmlinux] This is because it aggregated samples using comm and dso only. In fact, the above values are the same as when it sorts with -s comm,dso. $ perf report -s comm,dso | grep -e swapper -e perf 51.47% swapper [kernel.vmlinux] 4.14% perf [kernel.vmlinux] This resulted in a TUI failure with -ERANGE since it tries to increase the sample hit count for annotation with wrong symbols due to the incorrect aggregation. This patch fixes it not to skip elided fields when comparing samples in order to insert them into the hists. Committer note: After the patch, with a different workload: # perf report --show-total-period -S native_write_msr_safe --stdio # # symbol: native_write_msr_safe # # Samples: 455 of event 'cycles:pp' # Event count (approx.): 134787489 # # Overhead Period Command Shared Object # ........ ...... ............... ................
# 0.22% 293081 qemu-system-x86 [vmlinux] 0.19% 255914 swapper [vmlinux] 0.00% 2054 Timer [vmlinux] 0.00% 1021 firefox [vmlinux] 0.00% 2 perf [vmlinux] # perf report --show-total-period | grep native_write_msr_safe Failed to open /tmp/perf-14838.map, continuing without symbols 0.22% 293081 qemu-system-x86 [vmlinux] [k] native_write_msr_safe 0.19% 255914 swapper [vmlinux] [k] native_write_msr_safe 0.00% 2054 Timer [vmlinux] [k] native_write_msr_safe 0.00% 1021 firefox [vmlinux] [k] native_write_msr_safe 0.00% 2 perf [vmlinux] [k] native_write_msr_safe # Reported-by: Ingo Molnar Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448645559-31167-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 4fd37d6..6e8e0ee 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -924,9 +924,6 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) int64_t cmp = 0; perf_hpp__for_each_sort_list(fmt) { - if (perf_hpp__should_skip(fmt)) - continue; - cmp = fmt->cmp(fmt, left, right); if (cmp) break; @@ -942,9 +939,6 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right) int64_t cmp = 0; perf_hpp__for_each_sort_list(fmt) { - if (perf_hpp__should_skip(fmt)) - continue; - cmp = fmt->collapse(fmt, left, right); if (cmp) break; -- cgit v1.1 From 039050482573e168690d365b8ea1d4f599ebbbd8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 28 Nov 2015 02:32:39 +0900 Subject: perf hists browser: Update nr entries regardless of min percent When perf report on TUI was called with -S symbol filter, it should update nr entries even if min_pcnt is 0. IIRC the reason was to update nr entries after applying minimum percent threshold. But if symbol filter was given on command line (with -S option), it should use hists->nr_non_filtered_entries instead of hists->nr_entries. So this patch fixes a bug of navigating hists browser that the cursor goes beyond the number of entries when -S (or similar) option is used. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1448645559-31167-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index a211b7b..dcdcbaf 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2055,10 +2055,9 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, SLang_reset_tty(); SLang_init_tty(0, 0, 0); - if (min_pcnt) { + if (min_pcnt) browser->min_pcnt = min_pcnt; - hist_browser__update_nr_entries(browser); - } + hist_browser__update_nr_entries(browser); browser->pstack = pstack__new(3); if (browser->pstack == NULL) -- cgit v1.1 From 9d759a9b4ac2690077d8b21258e6e95c3e34bfa9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 27 Nov 2015 08:47:35 +0000 Subject: tools lib bpf: Collect map definition in bpf_object This patch collects more information from maps sections in BPF object files into 'struct bpf_object', enables later patches access those information (such as the type and size of the map). 
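Returning to the elided-fields fix above: with chained comparators, skipping one sort key makes entries that differ only in that key compare equal, so they get merged during aggregation. A toy demonstration of the effect (illustrative data, not perf's hist_entry code):

#include <stdio.h>
#include <string.h>

struct entry { const char *comm; const char *sym; };

/* chained comparison over the sort keys, first difference wins */
static int entry_cmp(const struct entry *a, const struct entry *b, int skip_sym)
{
	int cmp = strcmp(a->comm, b->comm);
	if (cmp)
		return cmp;
	if (skip_sym)		/* pre-fix behaviour: elided key ignored */
		return 0;
	return strcmp(a->sym, b->sym);
}

int main(void)
{
	struct entry a = { "swapper", "native_write_msr_safe" };
	struct entry b = { "swapper", "intel_idle" };

	printf("with sym key:    %s\n", entry_cmp(&a, &b, 0) ? "kept separate" : "merged");
	printf("sym key skipped: %s\n", entry_cmp(&a, &b, 1) ? "kept separate" : "merged");
	return 0;
}

Eliding, in other words, is purely a display decision; the comparison chain must stay complete or the aggregation keys change behind the user's back.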
In this patch, a new handler 'struct bpf_map' is extracted in parallel with bpf_object and bpf_program. Its iterator and accessor is also created. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448614067-197576-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 187 +++++++++++++++++++++++++++++++++---------------- tools/lib/bpf/libbpf.h | 21 ++++++ 2 files changed, 148 insertions(+), 60 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e3f4c33..f509825 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -163,22 +163,24 @@ struct bpf_program { bpf_program_clear_priv_t clear_priv; }; +struct bpf_map { + int fd; + struct bpf_map_def def; + void *priv; + bpf_map_clear_priv_t clear_priv; +}; + static LIST_HEAD(bpf_objects_list); struct bpf_object { char license[64]; u32 kern_version; - void *maps_buf; - size_t maps_buf_sz; struct bpf_program *programs; size_t nr_programs; - int *map_fds; - /* - * This field is required because maps_buf will be freed and - * maps_buf_sz will be set to 0 after loaded. - */ - size_t nr_map_fds; + struct bpf_map *maps; + size_t nr_maps; + bool loaded; /* @@ -489,21 +491,38 @@ static int bpf_object__init_maps(struct bpf_object *obj, void *data, size_t size) { - if (size == 0) { + size_t nr_maps; + int i; + + nr_maps = size / sizeof(struct bpf_map_def); + if (!data || !nr_maps) { pr_debug("%s doesn't need map definition\n", obj->path); return 0; } - obj->maps_buf = malloc(size); - if (!obj->maps_buf) { - pr_warning("malloc maps failed: %s\n", obj->path); + pr_debug("maps in %s: %zd bytes\n", obj->path, size); + + obj->maps = calloc(nr_maps, sizeof(obj->maps[0])); + if (!obj->maps) { + pr_warning("alloc maps for object failed\n"); return -ENOMEM; } + obj->nr_maps = nr_maps; + + for (i = 0; i < nr_maps; i++) { + struct bpf_map_def *def = &obj->maps[i].def; - obj->maps_buf_sz = size; - memcpy(obj->maps_buf, data, size); - pr_debug("maps in %s: %ld bytes\n", obj->path, (long)size); + /* + * fill all fd with -1 so won't close incorrect + * fd (fd=0 is stdin) when failure (zclose won't close + * negative fd)). 
+ */ + obj->maps[i].fd = -1; + + /* Save map definition into obj->maps */ + *def = ((struct bpf_map_def *)data)[i]; + } return 0; } @@ -688,37 +707,15 @@ static int bpf_object__create_maps(struct bpf_object *obj) { unsigned int i; - size_t nr_maps; - int *pfd; - - nr_maps = obj->maps_buf_sz / sizeof(struct bpf_map_def); - if (!obj->maps_buf || !nr_maps) { - pr_debug("don't need create maps for %s\n", - obj->path); - return 0; - } - obj->map_fds = malloc(sizeof(int) * nr_maps); - if (!obj->map_fds) { - pr_warning("realloc perf_bpf_map_fds failed\n"); - return -ENOMEM; - } - obj->nr_map_fds = nr_maps; - - /* fill all fd with -1 */ - memset(obj->map_fds, -1, sizeof(int) * nr_maps); + for (i = 0; i < obj->nr_maps; i++) { + struct bpf_map_def *def = &obj->maps[i].def; + int *pfd = &obj->maps[i].fd; - pfd = obj->map_fds; - for (i = 0; i < nr_maps; i++) { - struct bpf_map_def def; - - def = *(struct bpf_map_def *)(obj->maps_buf + - i * sizeof(struct bpf_map_def)); - - *pfd = bpf_create_map(def.type, - def.key_size, - def.value_size, - def.max_entries); + *pfd = bpf_create_map(def->type, + def->key_size, + def->value_size, + def->max_entries); if (*pfd < 0) { size_t j; int err = *pfd; @@ -726,22 +723,17 @@ bpf_object__create_maps(struct bpf_object *obj) pr_warning("failed to create map: %s\n", strerror(errno)); for (j = 0; j < i; j++) - zclose(obj->map_fds[j]); - obj->nr_map_fds = 0; - zfree(&obj->map_fds); + zclose(obj->maps[j].fd); return err; } pr_debug("create map: fd=%d\n", *pfd); - pfd++; } - zfree(&obj->maps_buf); - obj->maps_buf_sz = 0; return 0; } static int -bpf_program__relocate(struct bpf_program *prog, int *map_fds) +bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) { int i; @@ -761,7 +753,7 @@ bpf_program__relocate(struct bpf_program *prog, int *map_fds) return -LIBBPF_ERRNO__RELOC; } insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - insns[insn_idx].imm = map_fds[map_idx]; + insns[insn_idx].imm = obj->maps[map_idx].fd; } zfree(&prog->reloc_desc); @@ -780,7 +772,7 @@ bpf_object__relocate(struct bpf_object *obj) for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - err = bpf_program__relocate(prog, obj->map_fds); + err = bpf_program__relocate(prog, obj); if (err) { pr_warning("failed to relocate '%s'\n", prog->section_name); @@ -804,8 +796,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) Elf_Data *data = obj->efile.reloc[i].data; int idx = shdr->sh_info; struct bpf_program *prog; - size_t nr_maps = obj->maps_buf_sz / - sizeof(struct bpf_map_def); + size_t nr_maps = obj->nr_maps; if (shdr->sh_type != SHT_REL) { pr_warning("internal error at %d\n", __LINE__); @@ -1050,10 +1041,8 @@ int bpf_object__unload(struct bpf_object *obj) if (!obj) return -EINVAL; - for (i = 0; i < obj->nr_map_fds; i++) - zclose(obj->map_fds[i]); - zfree(&obj->map_fds); - obj->nr_map_fds = 0; + for (i = 0; i < obj->nr_maps; i++) + zclose(obj->maps[i].fd); for (i = 0; i < obj->nr_programs; i++) bpf_program__unload(&obj->programs[i]); @@ -1096,7 +1085,15 @@ void bpf_object__close(struct bpf_object *obj) bpf_object__elf_finish(obj); bpf_object__unload(obj); - zfree(&obj->maps_buf); + for (i = 0; i < obj->nr_maps; i++) { + if (obj->maps[i].clear_priv) + obj->maps[i].clear_priv(&obj->maps[i], + obj->maps[i].priv); + obj->maps[i].priv = NULL; + obj->maps[i].clear_priv = NULL; + } + zfree(&obj->maps); + obj->nr_maps = 0; if (obj->programs && obj->nr_programs) { for (i = 0; i < obj->nr_programs; i++) @@ -1251,3 +1248,73 @@ int bpf_program__nth_fd(struct bpf_program *prog, 
int n) return fd; } + +int bpf_map__get_fd(struct bpf_map *map) +{ + if (!map) + return -EINVAL; + + return map->fd; +} + +int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef) +{ + if (!map || !pdef) + return -EINVAL; + + *pdef = map->def; + return 0; +} + +int bpf_map__set_private(struct bpf_map *map, void *priv, + bpf_map_clear_priv_t clear_priv) +{ + if (!map) + return -EINVAL; + + if (map->priv) { + if (map->clear_priv) + map->clear_priv(map, map->priv); + } + + map->priv = priv; + map->clear_priv = clear_priv; + return 0; +} + +int bpf_map__get_private(struct bpf_map *map, void **ppriv) +{ + if (!map) + return -EINVAL; + + if (ppriv) + *ppriv = map->priv; + return 0; +} + +struct bpf_map * +bpf_map__next(struct bpf_map *prev, struct bpf_object *obj) +{ + size_t idx; + struct bpf_map *s, *e; + + if (!obj || !obj->maps) + return NULL; + + s = obj->maps; + e = obj->maps + obj->nr_maps; + + if (prev == NULL) + return s; + + if ((prev < s) || (prev >= e)) { + pr_warning("error in %s: map handler doesn't belong to object\n", + __func__); + return NULL; + } + + idx = (prev - obj->maps) + 1; + if (idx >= obj->nr_maps) + return NULL; + return &obj->maps[idx]; +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 949df4b..ef63125 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -165,4 +165,25 @@ struct bpf_map_def { unsigned int max_entries; }; +/* + * There is another 'struct bpf_map' in include/linux/map.h. However, + * it is not a uapi header so no need to consider name clash. + */ +struct bpf_map; + +struct bpf_map * +bpf_map__next(struct bpf_map *map, struct bpf_object *obj); +#define bpf_map__for_each(pos, obj) \ + for ((pos) = bpf_map__next(NULL, (obj)); \ + (pos) != NULL; \ + (pos) = bpf_map__next((pos), (obj))) + +int bpf_map__get_fd(struct bpf_map *map); +int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef); + +typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); +int bpf_map__set_private(struct bpf_map *map, void *priv, + bpf_map_clear_priv_t clear_priv); +int bpf_map__get_private(struct bpf_map *map, void **ppriv); + #endif -- cgit v1.1 From 561bbccac72d08babafaa33fd7fa9100ec4c9fb6 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 27 Nov 2015 08:47:36 +0000 Subject: tools lib bpf: Extract and collect map names from BPF object file This patch collects the names of maps in BPF object files and saves them into the 'maps' field in 'struct bpf_object'. 'bpf_object__get_map_by_name' is introduced to retrieve the fd and definition of a map through its name.
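Assuming the accessors introduced by these two patches (including the name accessor added in this commit's diff below), a caller could enumerate an object's maps and look one up by name roughly as follows. This is a sketch against the API as declared in this series; the map name "flip_table" and the include path are illustrative, and the object must already have been opened with the library's bpf_object__open():

#include <stdio.h>
#include "libbpf.h"	/* tools/lib/bpf */

/* Sketch: iterate all maps of a loaded object, then look one up by name. */
static int dump_maps(struct bpf_object *obj)
{
	struct bpf_map *map;
	struct bpf_map_def def;

	bpf_map__for_each(map, obj) {
		if (bpf_map__get_def(map, &def))
			continue;
		printf("map '%s': fd=%d, value_size=%u, max_entries=%u\n",
		       bpf_map__get_name(map), bpf_map__get_fd(map),
		       def.value_size, def.max_entries);
	}

	/* name-based lookup, e.g. to feed bpf_map_update_elem() */
	map = bpf_object__get_map_by_name(obj, "flip_table");
	return map ? bpf_map__get_fd(map) : -1;
}

Keeping 'struct bpf_map' opaque and exposing only these accessors lets the library reshuffle its internal map bookkeeping (as this very series does) without breaking callers.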
Signed-off-by: He Kuang Cc: Alexei Starovoitov Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448614067-197576-3-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++--- tools/lib/bpf/libbpf.h | 3 +++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f509825..a298614 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -165,6 +165,7 @@ struct bpf_program { struct bpf_map { int fd; + char *name; struct bpf_map_def def; void *priv; bpf_map_clear_priv_t clear_priv; @@ -526,12 +527,46 @@ bpf_object__init_maps(struct bpf_object *obj, void *data, return 0; } +static void +bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx) +{ + int i; + Elf_Data *symbols = obj->efile.symbols; + + if (!symbols || maps_shndx < 0) + return; + + for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { + GElf_Sym sym; + size_t map_idx; + const char *map_name; + + if (!gelf_getsym(symbols, i, &sym)) + continue; + if (sym.st_shndx != maps_shndx) + continue; + + map_name = elf_strptr(obj->efile.elf, + obj->efile.ehdr.e_shstrndx, + sym.st_name); + map_idx = sym.st_value / sizeof(struct bpf_map_def); + if (map_idx >= obj->nr_maps) { + pr_warning("index of map \"%s\" is buggy: %zu > %zu\n", + map_name, map_idx, obj->nr_maps); + continue; + } + obj->maps[map_idx].name = strdup(map_name); + pr_debug("map %zu is \"%s\"\n", map_idx, + obj->maps[map_idx].name); + } +} + static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; Elf_Scn *scn = NULL; - int idx = 0, err = 0; + int idx = 0, err = 0, maps_shndx = -1; /* Elf is corrupted/truncated, avoid calling elf_strptr. 
*/ if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) { @@ -581,10 +616,11 @@ static int bpf_object__elf_collect(struct bpf_object *obj) err = bpf_object__init_kversion(obj, data->d_buf, data->d_size); - else if (strcmp(name, "maps") == 0) + else if (strcmp(name, "maps") == 0) { err = bpf_object__init_maps(obj, data->d_buf, data->d_size); - else if (sh.sh_type == SHT_SYMTAB) { + maps_shndx = idx; + } else if (sh.sh_type == SHT_SYMTAB) { if (obj->efile.symbols) { pr_warning("bpf: multiple SYMTAB in %s\n", obj->path); @@ -625,6 +661,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj) if (err) goto out; } + + if (maps_shndx >= 0) + bpf_object__init_maps_name(obj, maps_shndx); out: return err; } @@ -1086,6 +1125,7 @@ void bpf_object__close(struct bpf_object *obj) bpf_object__unload(obj); for (i = 0; i < obj->nr_maps; i++) { + zfree(&obj->maps[i].name); if (obj->maps[i].clear_priv) obj->maps[i].clear_priv(&obj->maps[i], obj->maps[i].priv); @@ -1266,6 +1306,13 @@ int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef) return 0; } +const char *bpf_map__get_name(struct bpf_map *map) +{ + if (!map) + return NULL; + return map->name; +} + int bpf_map__set_private(struct bpf_map *map, void *priv, bpf_map_clear_priv_t clear_priv) { @@ -1318,3 +1365,15 @@ bpf_map__next(struct bpf_map *prev, struct bpf_object *obj) return NULL; return &obj->maps[idx]; } + +struct bpf_map * +bpf_object__get_map_by_name(struct bpf_object *obj, const char *name) +{ + struct bpf_map *pos; + + bpf_map__for_each(pos, obj) { + if (strcmp(pos->name, name) == 0) + return pos; + } + return NULL; +} diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index ef63125..a51594c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -170,6 +170,8 @@ struct bpf_map_def { * it is not a uapi header so no need to consider name clash. */ struct bpf_map; +struct bpf_map * +bpf_object__get_map_by_name(struct bpf_object *obj, const char *name); struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); @@ -180,6 +182,7 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj); int bpf_map__get_fd(struct bpf_map *map); int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef); +const char *bpf_map__get_name(struct bpf_map *map); typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); int bpf_map__set_private(struct bpf_map *map, void *priv, -- cgit v1.1 From 0bb93490170477224f8bd4cc9ce8920517461643 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 27 Nov 2015 08:47:37 +0000 Subject: perf bpf: Rename bpf config to program config Following patches are going to introduce BPF object level configuration to enable setting values into BPF maps. To avoid confusion, this patch renames existing 'config' in bpf-loader.c to 'program config'. Following patches would introduce 'object config'. 
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1448614067-197576-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-loader.c | 65 ++++++++++++++++++++++---------------------- tools/perf/util/bpf-loader.h | 2 +- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 36544e5..540a7ef 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -120,7 +120,7 @@ bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused, } static int -config__exec(const char *value, struct perf_probe_event *pev) +prog_config__exec(const char *value, struct perf_probe_event *pev) { pev->uprobes = true; pev->target = strdup(value); @@ -130,7 +130,7 @@ config__exec(const char *value, struct perf_probe_event *pev) } static int -config__module(const char *value, struct perf_probe_event *pev) +prog_config__module(const char *value, struct perf_probe_event *pev) { pev->uprobes = false; pev->target = strdup(value); @@ -140,8 +140,7 @@ config__module(const char *value, struct perf_probe_event *pev) } static int -config__bool(const char *value, - bool *pbool, bool invert) +prog_config__bool(const char *value, bool *pbool, bool invert) { int err; bool bool_value; @@ -158,17 +157,17 @@ config__bool(const char *value, } static int -config__inlines(const char *value, - struct perf_probe_event *pev __maybe_unused) +prog_config__inlines(const char *value, + struct perf_probe_event *pev __maybe_unused) { - return config__bool(value, &probe_conf.no_inlines, true); + return prog_config__bool(value, &probe_conf.no_inlines, true); } static int -config__force(const char *value, - struct perf_probe_event *pev __maybe_unused) +prog_config__force(const char *value, + struct perf_probe_event *pev __maybe_unused) { - return config__bool(value, &probe_conf.force_add, false); + return prog_config__bool(value, &probe_conf.force_add, false); } static struct { @@ -176,58 +175,58 @@ static struct { const char *usage; const char *desc; int (*func)(const char *, struct perf_probe_event *); -} bpf_config_terms[] = { +} bpf_prog_config_terms[] = { { .key = "exec", .usage = "exec=", .desc = "Set uprobe target", - .func = config__exec, + .func = prog_config__exec, }, { .key = "module", .usage = "module= ", .desc = "Set kprobe module", - .func = config__module, + .func = prog_config__module, }, { .key = "inlines", .usage = "inlines=[yes|no] ", .desc = "Probe at inline symbol", - .func = config__inlines, + .func = prog_config__inlines, }, { .key = "force", .usage = "force=[yes|no] ", .desc = "Forcibly add events with existing name", - .func = config__force, + .func = prog_config__force, }, }; static int -do_config(const char *key, const char *value, - struct perf_probe_event *pev) +do_prog_config(const char *key, const char *value, + struct perf_probe_event *pev) { unsigned int i; pr_debug("config bpf program: %s=%s\n", key, value); - for (i = 0; i < ARRAY_SIZE(bpf_config_terms); i++) - if (strcmp(key, bpf_config_terms[i].key) == 0) - return bpf_config_terms[i].func(value, pev); + for (i = 0; i < ARRAY_SIZE(bpf_prog_config_terms); i++) + if (strcmp(key, bpf_prog_config_terms[i].key) == 0) + return bpf_prog_config_terms[i].func(value, pev); - pr_debug("BPF: ERROR: invalid config option in object: %s=%s\n", + pr_debug("BPF: ERROR: invalid program config option: %s=%s\n", key, value); - pr_debug("\nHint: 
Currently valid options are:\n"); - for (i = 0; i < ARRAY_SIZE(bpf_config_terms); i++) - pr_debug("\t%s:\t%s\n", bpf_config_terms[i].usage, - bpf_config_terms[i].desc); + pr_debug("\nHint: Valid options are:\n"); + for (i = 0; i < ARRAY_SIZE(bpf_prog_config_terms); i++) + pr_debug("\t%s:\t%s\n", bpf_prog_config_terms[i].usage, + bpf_prog_config_terms[i].desc); pr_debug("\n"); - return -BPF_LOADER_ERRNO__CONFIG_TERM; + return -BPF_LOADER_ERRNO__PROGCONF_TERM; } static const char * -parse_config_kvpair(const char *config_str, struct perf_probe_event *pev) +parse_prog_config_kvpair(const char *config_str, struct perf_probe_event *pev) { char *text = strdup(config_str); char *sep, *line; @@ -253,7 +252,7 @@ parse_config_kvpair(const char *config_str, struct perf_probe_event *pev) } *equ = '\0'; - err = do_config(line, equ + 1, pev); + err = do_prog_config(line, equ + 1, pev); if (err) break; nextline: @@ -268,10 +267,10 @@ nextline: } static int -parse_config(const char *config_str, struct perf_probe_event *pev) +parse_prog_config(const char *config_str, struct perf_probe_event *pev) { int err; - const char *main_str = parse_config_kvpair(config_str, pev); + const char *main_str = parse_prog_config_kvpair(config_str, pev); if (IS_ERR(main_str)) return PTR_ERR(main_str); @@ -312,7 +311,7 @@ config_bpf_program(struct bpf_program *prog) pev = &priv->pev; pr_debug("bpf: config program '%s'\n", config_str); - err = parse_config(config_str, pev); + err = parse_prog_config(config_str, pev); if (err) goto errout; @@ -750,7 +749,7 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(EVENTNAME)] = "No event name found in config string", [ERRCODE_OFFSET(INTERNAL)] = "BPF loader internal error", [ERRCODE_OFFSET(COMPILE)] = "Error when compiling BPF scriptlet", - [ERRCODE_OFFSET(CONFIG_TERM)] = "Invalid config term in config string", + [ERRCODE_OFFSET(PROGCONF_TERM)] = "Invalid program config term in config string", [ERRCODE_OFFSET(PROLOGUE)] = "Failed to generate prologue", [ERRCODE_OFFSET(PROLOGUE2BIG)] = "Prologue too big for program", [ERRCODE_OFFSET(PROLOGUEOOB)] = "Offset out of bound for prologue", @@ -834,7 +833,7 @@ int bpf__strerror_probe(struct bpf_object *obj __maybe_unused, int err, char *buf, size_t size) { bpf__strerror_head(err, buf, size); - case BPF_LOADER_ERRNO__CONFIG_TERM: { + case BPF_LOADER_ERRNO__PROGCONF_TERM: { scnprintf(buf, size, "%s (add -v to see detail)", emsg); break; } diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index a58740b..6fdc045 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -20,7 +20,7 @@ enum bpf_loader_errno { BPF_LOADER_ERRNO__EVENTNAME, /* Event name is missing */ BPF_LOADER_ERRNO__INTERNAL, /* BPF loader internal error */ BPF_LOADER_ERRNO__COMPILE, /* Error when compiling BPF scriptlet */ - BPF_LOADER_ERRNO__CONFIG_TERM, /* Invalid config term in config term */ + BPF_LOADER_ERRNO__PROGCONF_TERM,/* Invalid program config term in config string */ BPF_LOADER_ERRNO__PROLOGUE, /* Failed to generate prologue */ BPF_LOADER_ERRNO__PROLOGUE2BIG, /* Prologue too big for program */ BPF_LOADER_ERRNO__PROLOGUEOOB, /* Offset out of bound for prologue */ -- cgit v1.1 From d6b56b0bc68ba7927b286da86eda1d4d4dbe63f6 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 28 Nov 2015 16:58:15 +0100 Subject: x86/platform/calgary: Constify cal_chipset_ops structures The cal_chipset_ops structures are never modified, so declare them as const. Done with the help of Coccinelle. 
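The pattern behind this cleanup, as a stand-alone sketch (hypothetical ops and device structs, not the Calgary code): an ops table of function pointers that is never written after initialization can be const-qualified and placed in read-only memory, provided every pointer to it is pointer-to-const as well:

#include <stdio.h>

struct chipset_ops {
	void (*handle_quirks)(void);
	void (*dump_error_regs)(void);
};

static void quirks(void)   { puts("quirks"); }
static void dumpregs(void) { puts("regs"); }

/* const: the compiler may place this table in read-only memory (.rodata) */
static const struct chipset_ops my_ops = {
	.handle_quirks   = quirks,
	.dump_error_regs = dumpregs,
};

struct device_table {
	/* must be pointer-to-const too, as in the iommu_table change above */
	const struct chipset_ops *chip_ops;
};

int main(void)
{
	struct device_table tbl = { .chip_ops = &my_ops };
	tbl.chip_ops->handle_quirks();
	return 0;
}

Both halves are needed: const on the definition alone would make the assignment to a non-const pointer a qualifier-discarding error, which is why the patch touches the header and the tables together.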
Signed-off-by: Julia Lawall Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Jon D. Mason Cc: Linus Torvalds Cc: Muli Ben-Yehuda Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448726295-10959-1-git-send-email-Julia.Lawall@lip6.fr Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calgary.h | 2 +- arch/x86/kernel/pci-calgary_64.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h index 0d467b3..a8303eb 100644 --- a/arch/x86/include/asm/calgary.h +++ b/arch/x86/include/asm/calgary.h @@ -31,7 +31,7 @@ #include struct iommu_table { - struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ + const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_hint; /* Hint for next alloc */ unsigned long *it_map; /* A simple allocation bitmap for now */ diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 0497f71..833b1d3 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -180,13 +180,13 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl); static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); static void get_tce_space_from_tar(void); -static struct cal_chipset_ops calgary_chip_ops = { +static const struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, .tce_cache_blast = calgary_tce_cache_blast, .dump_error_regs = calgary_dump_error_regs }; -static struct cal_chipset_ops calioc2_chip_ops = { +static const struct cal_chipset_ops calioc2_chip_ops = { .handle_quirks = calioc2_handle_quirks, .tce_cache_blast = calioc2_tce_cache_blast, .dump_error_regs = calioc2_dump_error_regs -- cgit v1.1 From c332813b51cbe807d539bb059b81235abf1e3fdd Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 1 Dec 2015 21:44:50 +0100 Subject: x86/mm/mtrr: Mark the 'range_new' static variable in mtrr_calc_range_state() as __initdata 'range_new' doesn't seem to be used after init. It is only passed to memset(), sum_ranges(), memcmp() and x86_get_mtrr_mem_range(), the latter of which also only passes it on to various *range* library functions. So mark it __initdata to free up an extra page after init. Its contents are wiped at every call to mtrr_calc_range_state(), so it being static is not about preserving state between calls, but simply to avoid a 4k+ stack frame. While there, add a comment explaining this and why it's safe. We could also mark nr_range_new as __initdata, but since it's just a single int and also doesn't carry state between calls (it is unconditionally assigned to before it is read), we might as well make it an ordinary automatic variable. Signed-off-by: Rasmus Villemoes Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/1449002691-20783-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 70d7c93..0d98503 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -593,9 +593,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct range range_new[RANGE_NUM]; + /* + * range_new should really be an automatic variable, but + * putting 4096 bytes on the stack is frowned upon, to put it + * mildly. It is safe to make it a static __initdata variable, + * since mtrr_calc_range_state is only called during init and + * there's no way it will call itself recursively. + */ + static struct range range_new[RANGE_NUM] __initdata; unsigned long range_sums_new; - static int nr_range_new; + int nr_range_new; int num_reg; /* Convert ranges to var ranges state: */ -- cgit v1.1 From 74b18e1750201530ce285a5cd1269a9fb592d905 Mon Sep 17 00:00:00 2001 From: "Wei, Gang" Date: Wed, 2 Dec 2015 07:07:20 +0000 Subject: x86/tboot: Update maintainer list for Intel TXT Update maintainer list for Intel TXT Signed-off-by: Gang Wei Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: H. Peter Anvin Cc: Joseph Cihula Cc: Linus Torvalds Cc: Ning Sun Cc: Peter Zijlstra Cc: Richard L Maliszewski Cc: Shane Wang Cc: Thomas Gleixner Cc: tboot-devel@lists.sourceforge.net Link: http://lkml.kernel.org/r/D0B11485C64D4B47B66902F8A4E901BE035656E6@shsmsx102.ccr.corp.intel.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5192700..9a502c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5621,9 +5621,7 @@ F: Documentation/trace/intel_th.txt F: drivers/hwtracing/intel_th/ INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT) -M: Richard L Maliszewski -M: Gang Wei -M: Shane Wang +M: Ning Sun L: tboot-devel@lists.sourceforge.net W: http://tboot.sourceforge.net T: hg http://tboot.hg.sourceforge.net:8000/hgroot/tboot/tboot -- cgit v1.1 From b3e0b1b6d841a4b2f64fc09ea728913da8218424 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Oct 2015 14:39:38 +0200 Subject: locking, sched: Introduce smp_cond_acquire() and use it Introduce smp_cond_acquire() which combines a control dependency and a read barrier to form acquire semantics. This primitive has two benefits: - it documents control dependencies, - its typically cheaper than using smp_load_acquire() in a loop. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 17 +++++++++++++++++ kernel/locking/qspinlock.c | 3 +-- kernel/sched/core.c | 8 +------- kernel/sched/sched.h | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 4dac103..00b042c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -299,6 +299,23 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +/** + * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering + * @cond: boolean expression to wait for + * + * Equivalent to using smp_load_acquire() on the condition variable but employs + * the control dependency of the wait to reduce the barrier on many platforms. + * + * The control dependency provides a LOAD->STORE order, the additional RMB + * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order, + * aka. ACQUIRE. + */ +#define smp_cond_acquire(cond) do { \ + while (!(cond)) \ + cpu_relax(); \ + smp_rmb(); /* ctrl + rmb := acquire */ \ +} while (0) + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 9862078..ed9d967 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -433,8 +433,7 @@ queue: * */ pv_wait_head(lock, node); - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) - cpu_relax(); + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); /* * claim the lock: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7063c6a..9f7862d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1968,19 +1968,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) cpu is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. - */ - while (p->on_cpu) - cpu_relax(); - /* - * Combined with the control dependency above, we have an effective - * smp_load_acquire() without the need for full barriers. * * Pairs with the smp_store_release() in finish_lock_switch(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. */ - smp_rmb(); + smp_cond_acquire(!p->on_cpu); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775..1e0bb4a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1076,7 +1076,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the control dependency and rmb in try_to_wake_up(). + * Pairs with the smp_cond_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif -- cgit v1.1 From 8643cda549ca49a403160892db68504569ac9052 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2015 19:01:11 +0100 Subject: sched/core, locking: Document Program-Order guarantees These are some notes on the scheduler locking and how it provides program order guarantees on SMP systems. ( This commit is in the locking tree, because the new documentation refers to a newly introduced locking primitive. 
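For a feel of how the newly introduced primitive is used, here is a user-space analogue of the flag-then-data shape (a sketch only: C11 atomics stand in for the kernel's barriers, and the stand-in macro uses an acquire fence where the kernel's smp_cond_acquire() relies on the control dependency plus smp_rmb(), and also omits cpu_relax()):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int ready;
static int payload;

/* user-space stand-in: spin on a relaxed load, then an acquire fence,
 * giving the same LOAD->{LOAD,STORE} ordering intent as the kernel macro */
#define cond_acquire(cond) do {						\
	while (!(cond))							\
		;							\
	atomic_thread_fence(memory_order_acquire);			\
} while (0)

static void *producer(void *arg)
{
	payload = 42;
	/* release store pairs with the acquire on the waiting side */
	atomic_store_explicit(&ready, 1, memory_order_release);
	return arg;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);
	cond_acquire(atomic_load_explicit(&ready, memory_order_relaxed));
	printf("payload = %d\n", payload);	/* guaranteed to see 42 */
	pthread_join(t, NULL);
	return 0;
}

This is exactly the shape of the try_to_wake_up() site in the patch: spin until the flag (p->on_cpu) clears, then rely on acquire ordering for everything read afterwards.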
) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: David Howells Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Michal Hocko Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f7862d..91db750 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1905,6 +1905,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } +/* + * Notes on Program-Order guarantees on SMP systems. + * + * MIGRATION + * + * The basic program-order guarantee on SMP systems is that when a task [t] + * migrates, all its activity on its old cpu [c0] happens-before any subsequent + * execution on its new cpu [c1]. + * + * For migration (of runnable tasks) this is provided by the following means: + * + * A) UNLOCK of the rq(c0)->lock scheduling out task t + * B) migration for t is required to synchronize *both* rq(c0)->lock and + * rq(c1)->lock (if not at the same time, then in that order). + * C) LOCK of the rq(c1)->lock scheduling in task + * + * Transitivity guarantees that B happens after A and C after B. + * Note: we only require RCpc transitivity. + * Note: the cpu doing B need not be c0 or c1 + * + * Example: + * + * CPU0 CPU1 CPU2 + * + * LOCK rq(0)->lock + * sched-out X + * sched-in Y + * UNLOCK rq(0)->lock + * + * LOCK rq(0)->lock // orders against CPU0 + * dequeue X + * UNLOCK rq(0)->lock + * + * LOCK rq(1)->lock + * enqueue X + * UNLOCK rq(1)->lock + * + * LOCK rq(1)->lock // orders against CPU2 + * sched-out Z + * sched-in X + * UNLOCK rq(1)->lock + * + * + * BLOCKING -- aka. SLEEP + WAKEUP + * + * For blocking we (obviously) need to provide the same guarantee as for + * migration. However the means are completely different as there is no lock + * chain to provide order. Instead we do: + * + * 1) smp_store_release(X->on_cpu, 0) + * 2) smp_cond_acquire(!X->on_cpu) + * + * Example: + * + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) + * + * LOCK rq(0)->lock LOCK X->pi_lock + * dequeue X + * sched-out X + * smp_store_release(X->on_cpu, 0); + * + * smp_cond_acquire(!X->on_cpu); + * X->state = WAKING + * set_task_cpu(X,2) + * + * LOCK rq(2)->lock + * enqueue X + * X->state = RUNNING + * UNLOCK rq(2)->lock + * + * LOCK rq(2)->lock // orders against CPU1 + * sched-out Z + * sched-in X + * UNLOCK rq(2)->lock + * + * UNLOCK X->pi_lock + * UNLOCK rq(0)->lock + * + * + * However; for wakeups there is a second guarantee we must provide, namely we + * must observe the state that lead to our wakeup. That is, not only must our + * task observe its own prior state, it must also observe the stores prior to + * its wakeup. + * + * This means that any means of doing remote wakeups must order the CPU doing + * the wakeup against the CPU the task is going to end up running on. This, + * however, is already required for the regular Program-Order guarantee above, + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). 
+ * + */ + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened -- cgit v1.1 From ad936d8658fd348338cb7d42c577dac77892b074 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Sat, 24 Oct 2015 01:16:19 +0900 Subject: sched/fair: Make it possible to account fair load avg consistently The current code accounts for the time a task was absent from the fair class (per ATTACH_AGE_LOAD). However it does not work correctly when a task got migrated or moved to another cgroup while outside of the fair class. This patch tries to address that by aging on migration. We locklessly read the 'last_update_time' stamp from both the old and new cfs_rq, ages the load upto the old time, and sets it to the new time. These timestamps should in general not be more than 1 tick apart from one another, so there is a definite bound on things. Signed-off-by: Byungchul Park [ Changelog, a few edits and !SMP build fix ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445616981-29904-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 11 ++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8969a9a..32d83e4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2120,6 +2120,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff8ec86..efd664c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2715,6 +2715,52 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. 
+ */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cdae23d..9a029fa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,7 +335,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -933,6 +941,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif -- cgit v1.1 From 7877a0ba5ec63c7b0111b06c773f1696fa17b35a Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 19 Nov 2015 16:47:29 +0100 Subject: sched/cputime: Remove extra cost in task_cputime() There is an extra cost in task_cputime() and task_cputime_scaled() when nohz_full is not activated. When vtime accounting is not enabled, we don't need to get deltas of utime and stime under the vtime seqlock. This patch removes that cost by adding a shortcut route if vtime accounting is not enabled. Use context_tracking_is_enabled() to check whether vtime accounting is active on some CPU; only in that case do we need to check the tickless cputime delta. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 05de80b..1128d4b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -853,6 +853,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { cputime_t udelta, sdelta; + if (!context_tracking_is_enabled()) { + if (utime) + *utime = t->utime; + if (stime) + *stime = t->stime; + return; + } + fetch_task_cputime(t, utime, stime, &t->utime, &t->stime, &udelta, &sdelta); if (utime) @@ -866,6 +874,14 @@ void task_cputime_scaled(struct task_struct *t, { cputime_t udelta, sdelta; + if (!context_tracking_is_enabled()) { + if (utimescaled) + *utimescaled = t->utimescaled; + if (stimescaled) + *stimescaled = t->stimescaled; + return; + } + fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) -- cgit v1.1 From 7098c1eac75dc03fdbb7249171a6e68ce6044a5a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Nov 2015 16:47:30 +0100 Subject: sched/cputime: Clarify vtime symbols and document them VTIME_SLEEPING state happens either when: 1) The task is sleeping and no tickless delta is to be added on the task cputime stats. 2) The CPU isn't running vtime at all, so the same properties of 1) applies. Lets rename the vtime symbol to reflect both states. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Hiroshi Shimamoto Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- kernel/fork.c | 2 +- kernel/sched/cputime.c | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f425aac..3533168 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1522,8 +1522,11 @@ struct task_struct { seqlock_t vtime_seqlock; unsigned long long vtime_snap; enum { - VTIME_SLEEPING = 0, + /* Task is sleeping or running in a CPU with VTIME inactive */ + VTIME_INACTIVE = 0, + /* Task runs in userspace in a CPU with VTIME active */ VTIME_USER, + /* Task runs in kernelspace in a CPU with VTIME active */ VTIME_SYS, } vtime_snap_whence; #endif diff --git a/kernel/fork.c b/kernel/fork.c index f97f2c4..c0a1370 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1350,7 +1350,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_init(&p->vtime_seqlock); p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_SLEEPING; + p->vtime_snap_whence = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1128d4b..4a18a6e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long long delta = vtime_delta(tsk); - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap += delta; /* CHECKME: always safe to convert nsecs to cputime? 
*/ @@ -764,7 +764,7 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { write_seqlock(&prev->vtime_seqlock); - prev->vtime_snap_whence = VTIME_SLEEPING; + prev->vtime_snap_whence = VTIME_INACTIVE; write_sequnlock(&prev->vtime_seqlock); write_seqlock(&current->vtime_seqlock); @@ -829,7 +829,7 @@ fetch_task_cputime(struct task_struct *t, *s_dst = *s_src; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_SLEEPING || + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; -- cgit v1.1 From cab245d68c38afff1a4c4d018ab7e1d316982f5d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Nov 2015 16:47:31 +0100 Subject: sched/cputime: Correctly handle task guest time on housekeepers When a task runs on a housekeeper (a CPU running with the periodic tick with neighbours running tickless), it doesn't account cputime using vtime but relies on the tick. Such a task has its vtime_snap_whence value set to VTIME_INACTIVE. Readers won't handle that correctly though. As long as vtime is running on some CPU, readers incorrectly assume that vtime runs on all CPUs and always compute the tickless cputime delta, which is only junk on housekeepers. So let's fix this by checking that the target runs on a vtime CPU through the appropriate state check before computing the tickless delta. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Hiroshi Shimamoto Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 4a18a6e..5cf24e7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -795,7 +795,7 @@ cputime_t task_gtime(struct task_struct *t) seq = read_seqbegin(&t->vtime_seqlock); gtime = t->gtime; - if (t->flags & PF_VCPU) + if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); } while (read_seqretry(&t->vtime_seqlock, seq)); -- cgit v1.1 From 55dbdcfa05533f44c9416070b8a9f6432b22314a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Nov 2015 16:47:32 +0100 Subject: sched/cputime: Rename vtime_accounting_enabled() to vtime_accounting_cpu_enabled() vtime_accounting_enabled() checks if vtime is running on the current CPU and is as such a misnomer. Let's rename it to a function whose name reflects its locality. We are going to need the current name for a function that tells if vtime runs at all on some CPU. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Hiroshi Shimamoto Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/context_tracking.h | 4 ++-- include/linux/vtime.h | 14 +++++++------- kernel/sched/cputime.c | 2 +- kernel/time/tick-sched.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 68b575a..d259274 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -86,7 +86,7 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static inline void guest_enter(void) { - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) vtime_guest_enter(current); else current->flags |= PF_VCPU; @@ -100,7 +100,7 @@ static inline void guest_exit(void) if (context_tracking_is_enabled()) __context_tracking_exit(CONTEXT_GUEST); - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) vtime_guest_exit(current); else current->flags &= ~PF_VCPU; diff --git a/include/linux/vtime.h b/include/linux/vtime.h index c5165fd..ca23e83 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -10,14 +10,14 @@ struct task_struct; /* - * vtime_accounting_enabled() definitions/declarations + * vtime_accounting_cpu_enabled() definitions/declarations */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -static inline bool vtime_accounting_enabled(void) { return true; } +static inline bool vtime_accounting_cpu_enabled(void) { return true; } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline bool vtime_accounting_enabled(void) +static inline bool vtime_accounting_cpu_enabled(void) { if (context_tracking_is_enabled()) { if (context_tracking_cpu_is_enabled()) @@ -29,7 +29,7 @@ static inline bool vtime_accounting_enabled(void) #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ #ifndef CONFIG_VIRT_CPU_ACCOUNTING -static inline bool vtime_accounting_enabled(void) { return false; } +static inline bool vtime_accounting_cpu_enabled(void) { return false; } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ @@ -44,7 +44,7 @@ extern void vtime_task_switch(struct task_struct *prev); extern void vtime_common_task_switch(struct task_struct *prev); static inline void vtime_task_switch(struct task_struct *prev) { - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) vtime_common_task_switch(prev); } #endif /* __ARCH_HAS_VTIME_TASK_SWITCH */ @@ -59,7 +59,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk); extern void vtime_common_account_irq_enter(struct task_struct *tsk); static inline void vtime_account_irq_enter(struct task_struct *tsk) { - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) vtime_common_account_irq_enter(tsk); } #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -78,7 +78,7 @@ extern void vtime_gen_account_irq_exit(struct task_struct *tsk); static inline void vtime_account_irq_exit(struct task_struct *tsk) { - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) vtime_gen_account_irq_exit(tsk); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5cf24e7..5727217 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -466,7 +466,7 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); - if (vtime_accounting_enabled()) + if 
(vtime_accounting_cpu_enabled()) return; if (sched_clock_irqtime) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 515edf3..11ce599 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -875,7 +875,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; /* * We stopped the tick in idle. Update process times would miss the -- cgit v1.1 From e592539466380279a9e6e6fdfe4545aa54f22593 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Nov 2015 16:47:33 +0100 Subject: sched/cputime: Introduce vtime accounting check for readers Readers need to know if vtime runs at all on some CPU somewhere, this is a fast-path check to determine if we need to check further the need to add up any tickless cputime delta. This fast path check uses context tracking state because vtime is tied to context tracking as of now. This check appears to be confusing though so lets use a vtime function that deals with context tracking details in vtime implementation instead. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Hiroshi Shimamoto Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 13 ++++++++++++- kernel/sched/cputime.c | 6 +++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index ca23e83..fa21969 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -17,9 +17,20 @@ static inline bool vtime_accounting_cpu_enabled(void) { return true; } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +/* + * Checks if vtime is enabled on some CPU. Cputime readers want to be careful + * in that case and compute the tickless cputime. + * For now vtime state is tied to context tracking. We might want to decouple + * those later if necessary. 
+ */ +static inline bool vtime_accounting_enabled(void) +{ + return context_tracking_is_enabled(); +} + static inline bool vtime_accounting_cpu_enabled(void) { - if (context_tracking_is_enabled()) { + if (vtime_accounting_enabled()) { if (context_tracking_cpu_is_enabled()) return true; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5727217..9989c3f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -788,7 +788,7 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; - if (!context_tracking_is_enabled()) + if (!vtime_accounting_enabled()) return t->gtime; do { @@ -853,7 +853,7 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { cputime_t udelta, sdelta; - if (!context_tracking_is_enabled()) { + if (!vtime_accounting_enabled()) { if (utime) *utime = t->utime; if (stime) @@ -874,7 +874,7 @@ void task_cputime_scaled(struct task_struct *t, { cputime_t udelta, sdelta; - if (!context_tracking_is_enabled()) { + if (!vtime_accounting_enabled()) { if (utimescaled) *utimescaled = t->utimescaled; if (stimescaled) -- cgit v1.1 From b7ce2277f087fd052e7e1bbf432f7fecbee82bb6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Nov 2015 16:47:34 +0100 Subject: sched/cputime: Convert vtime_seqlock to seqcount The cputime can only be updated by the current task itself, even in vtime case. So we can safely use seqcount instead of seqlock as there is no writer concurrency involved. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Hiroshi Shimamoto Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447948054-28668-8-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- kernel/sched/cputime.c | 46 ++++++++++++++++++++++++---------------------- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1c1ff7e..f2cb8d4 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -150,7 +150,7 @@ extern struct task_group root_task_group; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ - .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \ + .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ .vtime_snap = 0, \ .vtime_snap_whence = VTIME_SYS, #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 3533168..3b0de68 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,7 +1519,7 @@ struct task_struct { cputime_t gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_t vtime_seqlock; + seqcount_t vtime_seqcount; unsigned long long vtime_snap; enum { /* Task is sleeping or running in a CPU with VTIME inactive */ diff --git a/kernel/fork.c b/kernel/fork.c index c0a1370..eea32b5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1348,7 +1348,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_init(&p->vtime_seqlock); + seqcount_init(&p->vtime_seqcount); p->vtime_snap = 0; p->vtime_snap_whence = VTIME_INACTIVE; #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9989c3f..d5ff5c6 100644 --- a/kernel/sched/cputime.c +++ 
b/kernel/sched/cputime.c @@ -696,37 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_guest_enter(struct task_struct *tsk) @@ -738,19 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -763,24 +763,26 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - write_seqlock(&prev->vtime_seqlock); + write_seqcount_begin(&prev->vtime_seqcount); prev->vtime_snap_whence = VTIME_INACTIVE; - write_sequnlock(&prev->vtime_seqlock); + write_seqcount_end(&prev->vtime_seqcount); - write_seqlock(&current->vtime_seqlock); + write_seqcount_begin(&current->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_sequnlock(&current->vtime_seqlock); + write_seqcount_end(&current->vtime_seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - write_seqlock_irqsave(&t->vtime_seqlock, flags); + local_irq_save(flags); + write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; t->vtime_snap = sched_clock_cpu(cpu); - write_sequnlock_irqrestore(&t->vtime_seqlock, flags); + write_seqcount_end(&t->vtime_seqcount); + local_irq_restore(flags); } cputime_t task_gtime(struct task_struct *t) @@ -792,13 +794,13 @@ cputime_t task_gtime(struct task_struct *t) return t->gtime; do { - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while
(read_seqcount_retry(&t->vtime_seqcount, seq)); return gtime; } @@ -821,7 +823,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); if (u_dst) *u_dst = *u_src; @@ -845,7 +847,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } -- cgit v1.1 From ed82b8a1ff76ed7b2709e36ed361ddd022fe2407 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 29 Nov 2015 20:59:43 -0800 Subject: sched/core: Move the sched_to_prio[] arrays out of line When building a kernel with a gcc 6 snapshot the compiler complains about unused const static variables for prio_to_weight and prio_to_mult for multiple scheduler files (all but core.c and autogroup.c) The way the array is currently declared it will be duplicated in every scheduler file that includes sched.h, which seems rather wasteful. Move the array out of line into core.c. I also added a sched_ prefix to avoid any potential name space collisions. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448859583-3252-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 2 +- kernel/sched/core.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 42 ++---------------------------------------- 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed60..a5d966c 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32d83e4..d591db1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -8625,3 +8625,44 @@ void dump_cpu_task(int cpu) pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) 
+ */ +const int sched_prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +const u32 sched_prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9a029fa..472cd14 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1122,46 +1122,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WEIGHT_IDLEPRIO 3 #define WMULT_IDLEPRIO 1431655765 -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
- * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_HEAD 0x02 -- cgit v1.1 From a426f99c91d1036767a7819aaaba6bd3191b7f06 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Nov 2015 14:09:38 -0500 Subject: sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() Part of the responsibility of the update_sg_lb_stats() function is to update the idle_cpus statistical counter in struct sg_lb_stats. This check is done by calling idle_cpu(). The idle_cpu() function, in turn, checks a number of fields within the run queue structure such as rq->curr and rq->nr_running. With the current layout of the run queue structure, rq->curr and rq->nr_running are in separate cachelines. The rq->curr variable is checked first followed by nr_running. As nr_running is also accessed by update_sg_lb_stats() earlier, it makes no sense to load another cacheline when nr_running is not 0 as idle_cpu() will always return false in this case. This patch eliminates this redundant cacheline load by checking the cached nr_running before calling idle_cpu(). Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448478580-26467-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efd664c..4b0e8b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6398,7 +6398,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6415,7 +6415,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6423,7 +6424,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } -- cgit v1.1 From b0367629acf62a78404c467cd09df447c2fea804 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Dec 2015 13:41:49 -0500 Subject: sched/fair: Move the cache-hot 'load_avg' variable into its own cacheline If a system with large number of sockets was driven to full utilization, it was found that the clock tick handling occupied a rather significant proportion of CPU time when fair group scheduling and autogroup were enabled. 
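( A sketch of the problem pattern, not part of the patch: a write-hot per-tick counter sharing a cacheline with read-mostly fields. The struct below is illustrative only; the field names mirror struct task_group, but this is not the kernel's actual definition:

struct tg_like {
	unsigned long		shares;		/* read-mostly, read every tick */
	struct sched_entity	**se;		/* read-mostly, read every tick */
	struct cfs_rq		**cfs_rq;	/* read-mostly, read every tick */
	atomic_long_t		load_avg;	/* written every tick: each update
						 * invalidates the whole line on
						 * all other sockets */
};

Moving the hot counter onto its own line, as the sched.h hunk below does with ____cacheline_aligned, keeps writers from evicting the read-mostly fields. )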
Running a java benchmark on a 16-socket IvyBridge-EX system, the perf profile looked like:

  10.52%  0.00%  java  [kernel.vmlinux]  [k] smp_apic_timer_interrupt
   9.66%  0.05%  java  [kernel.vmlinux]  [k] hrtimer_interrupt
   8.65%  0.03%  java  [kernel.vmlinux]  [k] tick_sched_timer
   8.56%  0.00%  java  [kernel.vmlinux]  [k] update_process_times
   8.07%  0.03%  java  [kernel.vmlinux]  [k] scheduler_tick
   6.91%  1.78%  java  [kernel.vmlinux]  [k] task_tick_fair
   5.24%  5.04%  java  [kernel.vmlinux]  [k] update_cfs_shares

In particular, the high CPU time consumed by update_cfs_shares() was mostly due to contention on the cacheline that contained the task_group's load_avg statistical counter. This cacheline may also contain variables like shares, cfs_rq & se which are accessed rather frequently during clock tick processing. This patch moves the load_avg variable into another cacheline separated from the other frequently accessed variables. It also creates a cacheline aligned kmemcache for task_group to make sure that all the allocated task_group structures are cacheline aligned. By doing so, the perf profile became:

   9.44%  0.00%  java  [kernel.vmlinux]  [k] smp_apic_timer_interrupt
   8.74%  0.01%  java  [kernel.vmlinux]  [k] hrtimer_interrupt
   7.83%  0.03%  java  [kernel.vmlinux]  [k] tick_sched_timer
   7.74%  0.00%  java  [kernel.vmlinux]  [k] update_process_times
   7.27%  0.03%  java  [kernel.vmlinux]  [k] scheduler_tick
   5.94%  1.74%  java  [kernel.vmlinux]  [k] task_tick_fair
   4.15%  3.92%  java  [kernel.vmlinux]  [k] update_cfs_shares

The %cpu time is still pretty high, but it is better than before. The benchmark results before and after the patch were as follows:

  Before patch - Max-jOPs: 907533  Critical-jOps: 134877
  After patch  - Max-jOPs: 916011  Critical-jOps: 142366

Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1449081710-20185-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++--- kernel/sched/sched.h | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d591db1..aa3f978 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7370,6 +7370,9 @@ int in_sched_functions(unsigned long addr) */ struct task_group root_task_group; LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7427,11 +7430,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { @@ -7712,7 +7716,7 @@ static void free_sched_group(struct task_group *tg) free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); - kfree(tg); + kmem_cache_free(task_group_cache, tg); } /* allocate runqueue etc for a new task group */ @@ -7720,7 +7724,7 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); if (!tg) return ERR_PTR(-ENOMEM); diff --git
a/kernel/sched/sched.h b/kernel/sched/sched.h index 472cd14..a5a6b3e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -248,7 +248,12 @@ struct task_group { unsigned long shares; #ifdef CONFIG_SMP - atomic_long_t load_avg; + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; #endif #endif -- cgit v1.1 From aa0b7ae06387d40a988ce16a189082dee6e570bc Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 2 Dec 2015 13:41:50 -0500 Subject: sched/fair: Disable the task group load_avg update for the root_task_group Currently, the update_tg_load_avg() function attempts to update the tg's load_avg value whenever the load changes even for root_task_group where the load_avg value will never be used. This patch will disable the load_avg update when the given task group is the root_task_group. Running a Java benchmark with noautogroup and a 4.3 kernel on a 16-socket IvyBridge-EX system, the amount of CPU time (as reported by perf) consumed by task_tick_fair() which includes update_tg_load_avg() decreased from 0.71% to 0.22%, a more than 3X reduction. The Max-jOPs results also increased slightly from 983015 to 986449. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Douglas Hatch Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1449081710-20185-4-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b0e8b8..1093873 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2709,6 +2709,12 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; -- cgit v1.1 From 45e898b735620f426eddf105fc886d2966593a58 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:25 -0500 Subject: locking/pvqspinlock: Collect slowpath lock statistics This patch enables the accumulation of kicking and waiting related PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration option is selected. It also enables the collection of data which enable us to calculate the kicking and wakeup latencies which have a heavy dependency on the CPUs being used. The statistical counters are per-cpu variables to minimize the performance overhead in their updates. These counters are exported via the debugfs filesystem under the qlockstat directory. When the corresponding debugfs files are read, summation and computing of the required data are then performed. The measured latencies for different CPUs are: CPU Wakeup Kicking --- ------ ------- Haswell-EX 63.6us 7.4us Westmere-EX 67.6us 9.3us The measured latencies varied a bit from run-to-run. The wakeup latency is much higher than the kicking latency. 
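( The counters themselves are plain per-cpu variables; they are only folded together when a debugfs file is read. A minimal sketch of that read-side summation, condensed from the qstat_read() code in the patch below; qstat_sum() is an illustrative helper name, the real code does this inline in qstat_read():

static u64 qstat_sum(int counter)
{
	u64 sum = 0;
	int cpu;

	/*
	 * Lockless: per-cpu updates may race with this read, so the
	 * reported values are approximate, which is fine for statistics.
	 */
	for_each_possible_cpu(cpu)
		sum += per_cpu(qstats[counter], cpu);

	return sum;
}
)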
A sample of statistical counters after system bootup (with vCPU overcommit) was: pv_hash_hops=1.00 pv_kick_unlock=1148 pv_kick_wake=1146 pv_latency_kick=11040 pv_latency_wake=194840 pv_spurious_wakeup=7 pv_wait_again=4 pv_wait_head=23 pv_wait_node=1129 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-6-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 + kernel/locking/qspinlock_paravirt.h | 32 +++- kernel/locking/qspinlock_stat.h | 281 ++++++++++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+), 5 deletions(-) create mode 100644 kernel/locking/qspinlock_stat.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index db3622f..965fc42 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -687,6 +687,14 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. +config QUEUED_LOCK_STAT + bool "Paravirt queued spinlock statistics" + depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS + ---help--- + Enable the collection of statistical data on the slowpath + behavior of paravirtualized queued spinlocks and report + them on debugfs. + source "arch/x86/xen/Kconfig" config KVM_GUEST diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4bd323d..aaeeefb 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -41,6 +41,11 @@ struct pv_node { }; /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -100,10 +105,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) { unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); struct pv_hash_entry *he; + int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); + qstat_hop(hopcnt); return &he->lock; } } @@ -164,9 +172,11 @@ static void pv_init_node(struct mcs_spinlock *node) static void pv_wait_node(struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + int waitcnt = 0; int loop; - for (;;) { + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ + for (;; waitcnt++) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -184,12 +194,16 @@ static void pv_wait_node(struct mcs_spinlock *node) */ smp_store_mb(pn->state, vcpu_halted); - if (!READ_ONCE(node->locked)) + if (!READ_ONCE(node->locked)) { + qstat_inc(qstat_pv_wait_node, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&pn->state, vcpu_halted); + } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that value - * so that pv_wait_head() knows to not also try to hash this lock. + * If pv_kick_node() changed us to vcpu_hashed, retain that + * value so that pv_wait_head() knows to not also try to hash + * this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +214,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. 
*/ + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); } /* @@ -250,6 +265,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; + int waitcnt = 0; int loop; /* @@ -259,7 +275,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; - for (;;) { + for (;; waitcnt++) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) return; @@ -290,14 +306,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) return; } } + qstat_inc(qstat_pv_wait_head, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); + if (!READ_ONCE(l->locked)) + return; /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious * wakeup and so the vCPU should wait again after spinning for * a while. */ + qstat_inc(qstat_pv_spurious_wakeup, true); } /* @@ -352,6 +373,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ + qstat_inc(qstat_pv_kick_unlock, true); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 0000000..b1553ad --- /dev/null +++ b/kernel/locking/qspinlock_stat.h @@ -0,0 +1,281 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +/* + * When queued spinlock statistical counters are enabled, the following + * debugfs files will be created for reporting the counter values: + * + * /qlockstat/ + * pv_hash_hops - average # of hops per hashing operation + * pv_kick_unlock - # of vCPU kicks issued at unlock time + * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake + * pv_latency_kick - average latency (ns) of vCPU kick operation + * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_spurious_wakeup - # of spurious wakeups + * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_head - # of vCPU wait's at the queue head + * pv_wait_node - # of vCPU wait's at a non-head queue node + * + * Writing to the "reset_counters" file will reset all the above counter + * values. + * + * These statistical counters are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counters usable even in a production + * environment. + * + * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
+ */ +enum qlock_stats { + qstat_pv_hash_hops, + qstat_pv_kick_unlock, + qstat_pv_kick_wake, + qstat_pv_latency_kick, + qstat_pv_latency_wake, + qstat_pv_spurious_wakeup, + qstat_pv_wait_again, + qstat_pv_wait_head, + qstat_pv_wait_node, + qstat_num, /* Total number of statistical counters */ + qstat_reset_cnts = qstat_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statistics + */ +#include +#include +#include + +static const char * const qstat_names[qstat_num + 1] = { + [qstat_pv_hash_hops] = "pv_hash_hops", + [qstat_pv_kick_unlock] = "pv_kick_unlock", + [qstat_pv_kick_wake] = "pv_kick_wake", + [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", + [qstat_pv_latency_kick] = "pv_latency_kick", + [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_head] = "pv_wait_head", + [qstat_pv_wait_node] = "pv_wait_node", + [qstat_reset_cnts] = "reset_counters", +}; + +/* + * Per-cpu counters + */ +static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Function to read and return the qlock statistical counter values + * + * The following counters are handled specially: + * 1. qstat_pv_latency_kick + * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock + * 2. qstat_pv_latency_wake + * Average wake latency (ns) = pv_latency_wake/pv_kick_wake + * 3. qstat_pv_hash_hops + * Average hops/hash = pv_hash_hops/pv_kick_unlock + */ +static ssize_t qstat_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, counter, len; + u64 stat = 0, kicks = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + counter = (long)(file->f_inode->i_private); + + if (counter >= qstat_num) + return -EBADF; + + for_each_possible_cpu(cpu) { + stat += per_cpu(qstats[counter], cpu); + /* + * Need to sum additional counter for some of them + */ + switch (counter) { + + case qstat_pv_latency_kick: + case qstat_pv_hash_hops: + kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + break; + + case qstat_pv_latency_wake: + kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + break; + } + } + + if (counter == qstat_pv_hash_hops) { + u64 frac; + + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + + /* + * Return a X.XX decimal number + */ + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + } else { + /* + * Round to the nearest ns + */ + if ((counter == qstat_pv_latency_kick) || + (counter == qstat_pv_latency_wake)) { + stat = 0; + if (kicks) + stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + } + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + } + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When counter = reset_cnts, reset all the counter values. + * Since the counter updates aren't atomic, the resetting is done twice + * to make sure that the counters are very likely to be all cleared. 
+ */ +static ssize_t qstat_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(qstats, cpu); + + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_qstat = { + .read = qstat_read, + .write = qstat_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the qspinlock statistical counters + */ +static int __init init_qspinlock_stat(void) +{ + struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + int i; + + if (!d_qstat) { + pr_warn("Could not create 'qlockstat' debugfs directory\n"); + return 0; + } + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < qstat_num; i++) + debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat); + + debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat); + return 0; +} +fs_initcall(init_qspinlock_stat); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void qstat_inc(enum qlock_stats stat, bool cond) +{ + if (cond) + this_cpu_inc(qstats[stat]); +} + +/* + * PV hash hop count + */ +static inline void qstat_hop(int hopcnt) +{ + this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ + u64 start = sched_clock(); + + per_cpu(pv_kick_time, cpu) = start; + pv_kick(cpu); + this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + + *pkick_time = 0; + pv_wait(ptr, val); + if (*pkick_time) { + this_cpu_add(qstats[qstat_pv_latency_wake], + sched_clock() - *pkick_time); + qstat_inc(qstat_pv_kick_wake, true); + } +} + +#define pv_kick(c) __pv_kick(c) +#define pv_wait(p, v) __pv_wait(p, v) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +static inline void qstat_inc(enum qlock_stats stat, bool cond) { } +static inline void qstat_hop(int hopcnt) { } + +#endif /* CONFIG_QUEUED_LOCK_STAT */ -- cgit v1.1 From 1c4941fd53afb46ab15826628e4819866d008a28 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Nov 2015 16:18:56 -0500 Subject: locking/pvqspinlock: Allow limited lock stealing This patch allows one attempt for the lock waiter to steal the lock when entering the PV slowpath. To prevent lock starvation, the pending bit will be set by the queue head vCPU when it is in the active lock spinning loop to disable any lock stealing attempt. This helps to reduce the performance penalty caused by lock waiter preemption while not having much of the downsides of a real unfair lock. The pv_wait_head() function was renamed as pv_wait_head_or_lock() as it was modified to acquire the lock before returning. This is necessary because of possible lock stealing attempts from other tasks. 
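( The steal itself is a single trylock attempt on entry to the slowpath, refused whenever the lock is held or the queue head has set the pending bit. This restates pv_queued_spin_steal_lock() from the qspinlock_paravirt.h hunk below, with the checks annotated:

#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;

	/*
	 * Steal only if neither the locked nor the pending bit is set;
	 * the pending bit is the queue head's "no stealing" signal.
	 */
	return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
	       (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
}
)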
Linux kernel builds were run in a KVM guest on an 8-socket, 4 cores/socket Westmere-EX system and a 4-socket, 8 cores/socket Haswell-EX system. Both systems are configured to have 32 physical CPUs. The kernel build times before and after the patch were:

                   Westmere                  Haswell
  Patch         32 vCPUs   48 vCPUs      32 vCPUs   48 vCPUs
  -----         --------   --------      --------   --------
  Before patch   3m15.6s   10m56.1s       1m44.1s    5m29.1s
  After patch    3m02.3s    5m00.2s       1m43.7s    3m03.5s

For the overcommitted case (48 vCPUs), this patch is able to reduce kernel build time by more than 54% for Westmere and 44% for Haswell. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447190336-53317-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 26 +++++-- kernel/locking/qspinlock_paravirt.h | 141 ++++++++++++++++++++++++++++++------ kernel/locking/qspinlock_stat.h | 16 ++++ 3 files changed, 155 insertions(+), 28 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ed9d967..2ea4299 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, - struct mcs_spinlock *node) { } +static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) + { return 0; } #define pv_enabled() false #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node #define pv_kick_node __pv_kick_node -#define pv_wait_head __pv_wait_head +#define pv_wait_head_or_lock __pv_wait_head_or_lock #ifdef CONFIG_PARAVIRT_SPINLOCKS #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath @@ -431,10 +432,22 @@ queue: * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * + * The PV pv_wait_head_or_lock function, if active, will acquire + * the lock and return a non-zero value. So we have to skip the + * smp_load_acquire() call. As the next PV queue head hasn't been + * designated yet, there is no way for the locked value to become + * _Q_SLOW_VAL. So both the set_locked() and the + * atomic_cmpxchg_relaxed() calls will be safe. + * + * If PV isn't active, 0 will be returned instead. + * */ - pv_wait_head(lock, node); + if ((val = pv_wait_head_or_lock(lock, node))) + goto locked; + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); +locked: /* * claim the lock: * @@ -446,7 +459,8 @@ queue: * to grab the lock.
*/ for (;;) { - if (val != tail) { + /* In the PV case we might already have _Q_LOCKED_VAL set */ + if ((val & _Q_TAIL_MASK) != tail) { set_locked(lock); break; } @@ -493,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index aaeeefb..ace60a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -41,6 +41,89 @@ struct pv_node { }; /* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enters the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/* * Include queued spinlock statistics code */ #include "qspinlock_stat.h" @@ -202,8 +285,8 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * If pv_kick_node() changed us to vcpu_hashed, retain that - * value so that pv_wait_head() knows to not also try to hash - * this lock. + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -227,8 +310,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner.
* - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(); this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -257,10 +341,14 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; @@ -276,11 +364,18 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) lp = (struct qspinlock **)1; for (;; waitcnt++) { + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. + */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -296,36 +391,38 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); - if (!READ_ONCE(l->locked)) - return; /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ - qstat_inc(qstat_pv_spurious_wakeup, true); + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nonzero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* @@ -350,7 +447,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b1553ad..94d4533 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick * pv_wait_head - # of vCPU wait's at the queue head @@ -43,6 +44,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_head, @@ -66,6 +68,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", @@ -273,6 +276,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) +/* + * PV unfair trylock count tracking function + */ +static inline int qstat_spin_steal_lock(struct qspinlock *lock) +{ + int ret = pv_queued_spin_steal_lock(lock); + + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; +} +#undef queued_spin_trylock +#define queued_spin_trylock(l) qstat_spin_steal_lock(l) + #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -- cgit v1.1 From cd0272fab785077c121aa91ec2401090965bbc37 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:27 -0500 Subject: locking/pvqspinlock: Queue node adaptive spinning In an overcommitted guest where some vCPUs have to be halted to make forward progress in other areas, it is highly likely that a vCPU later in the spinlock queue will be spinning while the ones earlier in the queue would have been halted. The spinning in the later vCPUs is then just a waste of precious CPU cycles because they are not going to get the lock anytime soon, as the earlier ones have to be woken up and take their turn to get the lock. This patch implements an adaptive spinning mechanism where the vCPU will call pv_wait() if the previous vCPU is not running. Linux kernel builds were run in KVM guest on an 8-socket, 4 cores/socket Westmere-EX system and a 4-socket, 8 cores/socket Haswell-EX system. Both systems are configured to have 32 physical CPUs. The kernel build times before and after the patch were: Westmere Haswell Patch 32 vCPUs 48 vCPUs 32 vCPUs 48 vCPUs ----- -------- -------- -------- -------- Before patch 3m02.3s 5m00.2s 1m43.7s 3m03.5s After patch 3m03.0s 4m37.5s 1m43.0s 2m47.2s For 32 vCPUs, this patch doesn't cause any noticeable change in performance. For 48 vCPUs (over-committed), there is about 8% performance improvement. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-8-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 5 ++-- kernel/locking/qspinlock_paravirt.h | 46 +++++++++++++++++++++++++++++++++++-- kernel/locking/qspinlock_stat.h | 3 +++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2ea4299..393d187 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock) */ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node, + struct mcs_spinlock *prev) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, @@ -407,7 +408,7 @@ queue: prev = decode_tail(old); WRITE_ONCE(prev->next, node); - pv_wait_node(node); + pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ace60a4..87bb235 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -23,6 +23,20 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) /* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guests with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at a fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. + */ +#define PV_PREV_CHECK_MASK 0xff + +/* * Queue node uses: vcpu_running & vcpu_halted. * Queue head uses: vcpu_running & vcpu_hashed. */ @@ -235,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) } /* + * Return true when it is time to check the previous node and that node is + * not in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + + if ((loop & PV_PREV_CHECK_MASK) != 0) + return false; + + return READ_ONCE(prev->state) != vcpu_running; +} + +/* * Initialize the PV part of the mcs_spinlock node. */ static void pv_init_node(struct mcs_spinlock *node) @@ -252,17 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node) * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * behalf.
*/ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; + struct pv_node *pp = (struct pv_node *)prev; int waitcnt = 0; int loop; + bool wait_early; /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ for (;; waitcnt++) { - for (loop = SPIN_THRESHOLD; loop; loop--) { + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; + if (pv_wait_early(pp, loop)) { + wait_early = true; + break; + } cpu_relax(); } @@ -280,6 +314,7 @@ static void pv_wait_node(struct mcs_spinlock *node) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); qstat_inc(qstat_pv_wait_again, waitcnt); + qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -365,6 +400,12 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) for (;; waitcnt++) { /* + * Set correct vCPU state to be used by queue node wait-early + * mechanism. + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* * Set the pending bit in the active lock spinning loop to * disable lock stealing before attempting to acquire the lock. */ @@ -402,6 +443,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } + WRITE_ONCE(pn->state, vcpu_halted); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 94d4533..640dcec 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -25,6 +25,7 @@ * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node * @@ -47,6 +48,7 @@ enum qlock_stats { qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, + qstat_pv_wait_early, qstat_pv_wait_head, qstat_pv_wait_node, qstat_num, /* Total number of statistical counters */ @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_latency_wake] = "pv_latency_wake", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_early] = "pv_wait_early", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", [qstat_reset_cnts] = "reset_counters", -- cgit v1.1 From fbd35c0d2fb41b75863a0e45fe939c8440375b0a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 27 Oct 2015 12:53:48 -0700 Subject: locking/cmpxchg, arch: Remove tas() definitions It seems that commit 5dc12ddee93 ("Remove tas()") missed some files. Correct this and fully drop this macro, for which we should be using cmpxchg()-like calls. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andrew Morton Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Steven Miao Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1445975631-17047-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- arch/blackfin/include/asm/cmpxchg.h | 1 - arch/c6x/include/asm/cmpxchg.h | 2 -- arch/frv/include/asm/cmpxchg.h | 2 -- arch/tile/include/asm/cmpxchg.h | 2 -- 4 files changed, 7 deletions(-) diff --git a/arch/blackfin/include/asm/cmpxchg.h b/arch/blackfin/include/asm/cmpxchg.h index c05868c..2539288 100644 --- a/arch/blackfin/include/asm/cmpxchg.h +++ b/arch/blackfin/include/asm/cmpxchg.h @@ -128,6 +128,5 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, #endif /* !CONFIG_SMP */ #define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) -#define tas(ptr) ((void)xchg((ptr), 1)) #endif /* __ARCH_BLACKFIN_CMPXCHG__ */ diff --git a/arch/c6x/include/asm/cmpxchg.h b/arch/c6x/include/asm/cmpxchg.h index b27c8ce..93d0a5a 100644 --- a/arch/c6x/include/asm/cmpxchg.h +++ b/arch/c6x/include/asm/cmpxchg.h @@ -47,8 +47,6 @@ static inline unsigned int __xchg(unsigned int x, volatile void *ptr, int size) #define xchg(ptr, x) \ ((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \ sizeof(*(ptr)))) -#define tas(ptr) xchg((ptr), 1) - #include diff --git a/arch/frv/include/asm/cmpxchg.h b/arch/frv/include/asm/cmpxchg.h index 5b04dd0..a899765 100644 --- a/arch/frv/include/asm/cmpxchg.h +++ b/arch/frv/include/asm/cmpxchg.h @@ -69,8 +69,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v); #endif -#define tas(ptr) (xchg((ptr), 1)) - /*****************************************************************************/ /* * compare and conditionally exchange value with memory diff --git a/arch/tile/include/asm/cmpxchg.h b/arch/tile/include/asm/cmpxchg.h index 0ccda3c..25d5899 100644 --- a/arch/tile/include/asm/cmpxchg.h +++ b/arch/tile/include/asm/cmpxchg.h @@ -127,8 +127,6 @@ long long _atomic64_cmpxchg(long long *v, long long o, long long n); #endif -#define tas(ptr) xchg((ptr), 1) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_TILE_CMPXCHG_H */ -- cgit v1.1 From d5a73cadf3fdec95e9518ee5bb91bd0747c42b30 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 27 Oct 2015 12:53:49 -0700 Subject: locking/barriers, arch: Use smp barriers in smp_store_release() With commit b92b8b35a2e ("locking/arch: Rename set_mb() to smp_store_mb()") it was made clear that the context of this call (and thus set_mb) is strictly for CPU ordering, as opposed to IO. As such all archs should use the smp variant of mb(), respecting the semantics and saving a mandatory barrier on UP. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1445975631-17047-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/barrier.h | 2 +- arch/powerpc/include/asm/barrier.h | 2 +- arch/s390/include/asm/barrier.h | 2 +- include/asm-generic/barrier.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h index df896a1..209c4b8 100644 --- a/arch/ia64/include/asm/barrier.h +++ b/arch/ia64/include/asm/barrier.h @@ -77,7 +77,7 @@ do { \ ___p1; \ }) -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) /* * The group barrier in front of the rsm & ssm are necessary to ensure diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 0eca6ef..a7af5fb 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -34,7 +34,7 @@ #define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory") -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #ifdef __SUBARCH_HAS_LWSYNC # define SMPWMB LWSYNC diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index d68e11e..7ffd0b1 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -36,7 +36,7 @@ #define smp_mb__before_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb() -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_store_release(p, v) \ do { \ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index b42afad..0f45f93 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -93,7 +93,7 @@ #endif /* CONFIG_SMP */ #ifndef smp_store_mb -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #endif #ifndef smp_mb__before_atomic -- cgit v1.1 From 2d142e599bf73ab70a3457e6947f86935245415e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 27 Oct 2015 12:53:51 -0700 Subject: locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation It serves no purpose but to confuse readers, and is most likely a left over from constant memory-barriers.txt updates. I.e.: http://lists.openwall.net/linux-kernel/2006/07/15/27 Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andrew Morton Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445975631-17047-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index aef9487..c85054d 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1673,8 +1673,8 @@ There are some more advanced barrier functions: (*) smp_store_mb(var, value) This assigns the value to the variable and then inserts a full memory - barrier after it, depending on the function. It isn't guaranteed to - insert anything more than a compiler barrier in a UP compilation. + barrier after it. It isn't guaranteed to insert anything more than a + compiler barrier in a UP compilation. (*) smp_mb__before_atomic(); -- cgit v1.1 From 071ac0c4e8e90d5de05f0779b03ae69ce84820d5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 13:12:59 +0100 Subject: x86/mm/ptdump: Make (debugfs)/kernel_page_tables read-only File should be created with S_IRUSR and not with S_IWUSR too because writing to it doesn't make any sense. I mean, we don't have a ->write method anyway but let's have the permissions correct too. Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448885579-32506-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/debug_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index b35ee86..bfcffdf 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -26,7 +26,7 @@ static struct dentry *pe; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; -- cgit v1.1 From 06f60de19d3141f07d954c9275fe7ccca8e96b42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:15:52 -0700 Subject: rcu: Short-circuit synchronize_sched_expedited() if only one CPU If there is only one CPU, then invoking synchronize_sched_expedited() is by definition a grace period. This commit checks for this condition and does a short-circuit return in that case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cd..bd2605c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3822,6 +3822,10 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); -- cgit v1.1 From 1de6e56ddc043437d335ee0455a1b34b73510c91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 09:45:00 -0700 Subject: rcu: Clarify role of ->expmaskinitnext Analogy with the ->qsmaskinitnext field might lead one to believe that ->expmaskinitnext tracks online CPUs. This belief is incorrect: Any CPU that has ever been online will have its bit set in the ->expmaskinitnext field. 
This commit therefore adds a comment to make this clear, at least to people who read comments. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f32bebb..8151971 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -178,6 +178,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ -- cgit v1.1 From 886ef5a18a4a771d5fdc0e23ae9373bb35d529e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 12:34:40 -0700 Subject: rcu: Move smp_mb() from rcu_seq_snap() to rcu_exp_gp_seq_snap() The memory barrier in rcu_seq_snap() is needed only for grace periods, so this commit moves it to the grace-period-oriented wrapper rcu_exp_gp_seq_snap(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd2605c..a4a0475 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3347,7 +3347,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. */ return s; @@ -3374,6 +3373,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -- cgit v1.1 From 1307f2148719cc9e9d12f5fa7d5b3b61ec5aef72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 15:29:21 -0700 Subject: rcu: Invert sync_rcu_exp_select_cpus() "if" statement This commit saves a couple lines of code and reduces indentation by inverting the sense of an "if" statement in the function sync_rcu_exp_select_cpus(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4a0475..00f07d6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3716,24 +3716,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. 
*/ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; -- cgit v1.1 From df5bd5144a80a9f6c3807383b11f735dae9caf9d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Oct 2015 10:26:24 -0700 Subject: rcu: Reduce expedited GP memory contention via per-CPU variables Currently, the piggybacked-work checks carried out by sync_exp_work_done() atomically increment a small set of variables (the ->expedited_workdone0, ->expedited_workdone1, ->expedited_workdone2, ->expedited_workdone3 fields in the rcu_state structure), which will form a memory-contention bottleneck given a sufficiently large number of CPUs concurrently invoking either synchronize_rcu_expedited() or synchronize_sched_expedited(). This commit therefore moves these four fields to the per-CPU rcu_data structure, eliminating the memory contention. The show_rcuexp() function also changes to sum up each field in the rcu_data structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ kernel/rcu/tree.h | 8 ++++---- kernel/rcu/tree_trace.c | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 00f07d6..33d7e25 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3585,7 +3585,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3599,7 +3599,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3613,14 +3613,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness.
*/ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3630,7 +3629,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8151971..6cbec31 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -500,10 +504,6 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8efaba8..d436494 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -183,14 +183,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- cgit v1.1 From 73f36f9de8bed78bcda2704a348594c20518b455 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 10:56:55 -0800 Subject: rcu: Make expedited grace periods resolve stall-warning ties Currently, if a grace period ends just as the stall-warning timeout fires, an empty stall warning will be printed. This is not helpful, so this commit avoids these useless warnings by rechecking completion after awakening in synchronize_sched_expedited_wait(). Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 33d7e25..bc6b797 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3757,7 +3757,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ -- cgit v1.1 From 72611ab9f5d2d384a04e72d560c9c82463115cbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 13:25:21 -0800 Subject: rcu: Add more diagnostics to expedited stall warning messages. This commit adds print statements that check the rcu_node structure to find which ->expmask bits and which ->exp_tasks structures are blocking the current expedited grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bc6b797..6a652d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3745,6 +3745,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3767,14 +3768,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3783,8 +3786,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { -- cgit v1.1 From 5a9be7c628c5273f84abacebf7faf2488376e0f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Nov 2015 15:44:06 -0800 Subject: rcu: Add rcu_normal kernel parameter to suppress expediting Although expedited grace periods can be quite useful, and although their OS jitter has been greatly reduced, they can still pose problems for extreme real-time workloads. This commit therefore adds a rcu_normal kernel boot parameter (which can also be manipulated via sysfs) to suppress expedited grace periods, that is, to treat requests for expedited grace periods as if they were requests for normal grace periods. If both rcu_expedited and rcu_normal are specified, rcu_normal wins. 
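In outline, the resulting precedence looks as follows (an illustrative sketch only, where the _outline suffix is a name invented here; the real change is the synchronize_sched_expedited() hunk below):

void synchronize_sched_expedited_outline(void)
{
	/* If only one CPU, this is automatically a grace period. */
	if (rcu_blocking_is_gp())
		return;

	/* rcu_normal overrides rcu_expedited: fall back to a normal GP. */
	if (rcu_gp_is_normal()) {
		wait_rcu_gp(call_rcu_sched);
		return;
	}

	/* ... otherwise run the expedited machinery ... */
}

Since the new attribute is registered via KERNEL_ATTR_RW() in kernel/ksysfs.c, it should appear as /sys/kernel/rcu_normal, alongside the existing /sys/kernel/rcu_expedited, so the boot-time choice can be reversed at runtime.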
This means that if you are relying on expedited grace periods to speed up boot, you will want to specify rcu_expedited on the kernel command line, and then specify rcu_normal via sysfs once boot completes. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 19 ++++++++++++++----- include/linux/rcupdate.h | 6 ++++++ kernel/ksysfs.c | 22 ++++++++++++++++++++-- kernel/rcu/srcu.c | 2 +- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_plugin.h | 6 ++++++ kernel/rcu/update.c | 12 ++++++++++++ 7 files changed, 65 insertions(+), 8 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d..7673943 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3296,6 +3296,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_cpu_stall_suppress= [KNL] + Suppress RCU CPU stall warning messages. + + rcupdate.rcu_cpu_stall_timeout= [KNL] + Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for example, synchronize_rcu_expedited() instead @@ -3303,11 +3309,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. but can increase CPU utilization, degrade real-time latency, and degrade energy efficiency. - rcupdate.rcu_cpu_stall_suppress= [KNL] - Suppress RCU CPU stall warning messages. - - rcupdate.rcu_cpu_stall_timeout= [KNL] - Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_normal= [KNL] + Use only normal grace-period primitives, + for example, synchronize_rcu() instead of + synchronize_rcu_expedited(). This improves + real-time latency, CPU utilization, and energy + efficiency, but can expose users to increased + grace-period latency. This parameter overrides + rcupdate.rcu_expedited. rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a0189ba..98d9f30c0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -49,9 +49,14 @@ #include extern int rcu_expedited; /* for sysctl */ +extern int rcu_normal; /* also for sysctl */ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ { return false; @@ -65,6 +70,7 @@ static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. 
*/ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b264..b4e2fa5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include #include -#include /* rcu_expedited */ +#include /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -148,7 +148,7 @@ int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +161,23 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -203,6 +220,7 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, NULL }; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea..9b9cdd5 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6a652d1..4899929 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3841,6 +3841,12 @@ void synchronize_sched_expedited(void) if (rcu_blocking_is_gp()) return; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 57ba873..d45df37 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -746,6 +746,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5..8fccda3 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,6 +61,7 @@ MODULE_ALIAS("rcupdate"); #define MODULE_PARAM_PREFIX "rcupdate." module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +114,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. 
+ */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); -- cgit v1.1 From 967dcb8fe6a9a75be346400539261e0416baf370 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2015 16:52:36 -0800 Subject: rcu: Wire up rcu_end_inkernel_boot() This commit adds the invocation of rcu_end_inkernel_boot() just before init is invoked. This allows the CONFIG_RCU_EXPEDITE_BOOT Kconfig option to do something useful and prepares for the upcoming rcupdate.rcu_normal_after_boot kernel parameter. Signed-off-by: Paul E. McKenney --- init/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/init/main.c b/init/main.c index 9e64d70..c6ebefa 100644 --- a/init/main.c +++ b/init/main.c @@ -943,6 +943,8 @@ static int __ref kernel_init(void *unused) flush_delayed_fput(); + rcu_end_inkernel_boot(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) -- cgit v1.1 From 3e42ec1aa716f10c68294b8492ae3ea684528699 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2015 18:56:00 -0800 Subject: rcu: Allow expedited grace periods to be disabled at init Expedited grace periods can speed up boot, but are undesirable in aggressive real-time systems. This commit therefore introduces a kernel parameter rcupdate.rcu_normal_after_boot that disables expedited grace periods just before init is spawned. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 5 +++++ kernel/rcu/update.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7673943..197305bb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3318,6 +3318,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. grace-period latency. This parameter overrides rcupdate.rcu_expedited. + rcupdate.rcu_normal_after_boot= [KNL] + Once boot has completed (that is, after + rcu_end_inkernel_boot() has been invoked), use + only normal grace-period primitives. + rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning messages. Disable with a value less than or equal diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 8fccda3..12b91f5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -63,6 +63,9 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); + #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? @@ -178,6 +181,8 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } #ifdef CONFIG_PREEMPT_RCU -- cgit v1.1 From 3dc5dbe9a1b815b659a6b04540fc6fd4b4e3831b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 Sep 2015 14:51:24 -0700 Subject: rcu: Move lock_class_key to local scope Currently, the rcu_node_class[], rcu_fqs_class[], and rcu_exp_class[] arrays needlessly pollute the global namespace within tree.c. This commit therefore converts them to static local variables within rcu_init_one(). Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cd..23df266 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -4365,6 +4361,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ -- cgit v1.1 From 47dbc90663f697a4515a8dd5c99ae43dba108cb4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 27 Sep 2015 19:14:57 -0400 Subject: kernel: Make rcu/tree_trace.c explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config TREE_RCU_TRACE init/Kconfig: def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU ) ...meaning that it currently is not being built as a module by anyone. Let's remove the modular code that is essentially orphaned, so that when reading the file there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We could consider moving this to an earlier initcall if desired. We don't replace module.h with init.h since the file already has that. We also delete the moduleparam.h include that is left over from commit 64db4cfff99c04cd5f550357edcc8780f96b54a2 (""Tree RCU": scalable classic RCU implementation") since it is not needed here either. We morph some tags like MODULE_AUTHOR into the comments at the top of the file for documentation purposes. Cc: "Paul E. McKenney" Cc: Josh Triplett Reviewed-by: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_trace.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8efaba8..82aca98 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -487,16 +486,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E.
McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); -- cgit v1.1 From fecbf6f01fbd83e6419ccb7f61d9a6eb987f1d92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2015 18:19:24 -0700 Subject: rcu: Simplify rcu_sched_qs() control flow This commit applies an early-exit approach to rcu_sched_qs(), reducing the nesting level and saving a line of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23df266..ed3bc05 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -244,22 +244,21 @@ void rcu_sched_qs(void) { unsigned long flags; - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } + local_irq_restore(flags); } void rcu_bh_qs(void) -- cgit v1.1 From 8ba9153b2c3ab733d64e22adb57820ccb6afc496 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 07:55:41 -0700 Subject: rcu: Remove lock-acquisition loop from rcu_read_unlock_special() Several releases have come and gone without the warning triggering, so remove the lock-acquisition loop. Retain the WARN_ON_ONCE() out of sheer paranoia. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 57ba873..ae4ce2b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -449,19 +449,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. 
*/ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit v1.1 From 699d40352059e64a4d993af170272585c41988d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:47:49 -0700 Subject: rcu: Fix obsolete rcu_bootup_announce_oddness() comment This function no longer has #ifdefs, so this commit removes the header comment calling them out. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ae4ce2b..42df937 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { -- cgit v1.1 From f0f2e7d307fff226e0c1df5a07101a1216a46d8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:59:32 -0700 Subject: rcu: Avoid tick_nohz_active checks on NOCBs CPUs Currently, rcu_prepare_for_idle() checks for tick_nohz_active, even on individual NOCBs CPUs, unless all CPUs are marked as NOCBs CPUs at build time. This check is pointless on NOCBs CPUs because they never have any callbacks posted, given that all of their callbacks are handed off to the corresponding rcuo kthread. There is a check for individually designated NOCBs CPUs, but it pointlessly follows the check for tick_nohz_active. This commit therefore moves the check for individually designated NOCBs CPUs up with the check for CONFIG_RCU_NOCB_CPU_ALL. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 42df937..8e9d4a4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1513,7 +1513,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1527,10 +1528,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating -- cgit v1.1 From 46a5d164db53ba6066b11889abb7fa6bddbe5cf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 09:10:48 -0700 Subject: rcu: Stop disabling interrupts in scheduler fastpaths We need the scheduler's fastpaths to be, well, fast, and unnecessarily disabling and re-enabling interrupts is not necessarily consistent with this goal, especially given that there are regions of the scheduler that already have interrupts disabled. This commit therefore moves the call to rcu_note_context_switch() to one of the interrupts-disabled regions of the scheduler, and removes the now-redundant disabling and re-enabling of interrupts from rcu_note_context_switch() and the functions it calls.
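Reduced to its essentials, the reordering looks as follows (an illustrative sketch of the new __schedule() shape, not the literal code; the full diff is below, and schedule_outline() is a name invented for this sketch):

static void schedule_outline(struct rq *rq)
{
	local_irq_disable();		/* was hidden inside raw_spin_lock_irq() */
	rcu_note_context_switch();	/* now runs with interrupts already off */
	smp_mb__before_spinlock();
	raw_spin_lock(&rq->lock);	/* replaces raw_spin_lock_irq(&rq->lock) */
	/* ... pick the next task and context-switch ... */
}

With interrupts guaranteed off at the call site, rcu_sched_qs(), rcu_momentary_dyntick_idle() and the preemptible-RCU queuing path can drop their own local_irq_save()/local_irq_restore() pairs, which is what the remaining hunks do.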
Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Shift rcu_note_context_switch() to avoid deadlock, as suggested by Peter Zijlstra. ] --- include/linux/rcutree.h | 2 +- kernel/rcu/tree.c | 27 ++++++++++++--------------- kernel/rcu/tree_plugin.h | 14 ++++++-------- kernel/sched/core.c | 6 ++++-- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 60d15a0..9d3eda3 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,7 @@ void rcu_cpu_stall_reset(void); /* * Note a virtualization-based context switch. This is simply a * wrapper around rcu_note_context_switch(), which allows TINY_RCU - * to save a few bytes. + * to save a few bytes. The caller must have disabled interrupts. */ static inline void rcu_virt_note_context_switch(int cpu) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ed3bc05..93941d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -242,8 +242,6 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -252,13 +250,9 @@ void rcu_sched_qs(void) __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - } - local_irq_restore(flags); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -295,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -335,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -371,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8e9d4a4..e6da888 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -146,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. 
*/ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -250,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -285,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -300,7 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -316,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9..ec72de2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3085,7 +3085,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3104,13 +3103,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ -- cgit v1.1 From 7d86dccf28a3ae2f790f399fc82d4c82521fd078 Mon Sep 17 00:00:00 2001 From: Petko Manolov Date: Mon, 12 Oct 2015 18:23:51 +0300 Subject: list: Introduces generic list_splice_tail_init_rcu() list_splice_init_rcu() can be used as a stack onto which full lists are pushed, but queue-like behavior is now needed by some security policies. This requires a list_splice_tail_init_rcu(). This commit therefore supplies a list_splice_tail_init_rcu() by pulling code common to it and to list_splice_init_rcu() into a new __list_splice_init_rcu() function. This new function is based on the existing list_splice_init_rcu() implementation. Signed-off-by: Petko Manolov Cc: Mimi Zohar Signed-off-by: Paul E. 
McKenney --- include/linux/rculist.h | 69 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed5409..e99d834 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -179,32 +179,31 @@ static inline void list_replace_rcu(struct list_head *old, } /** - * list_splice_init_rcu - splice an RCU-protected list into an existing list. + * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice - * @head: the place in the list to splice the first list into + * @prev: points to the last element of the existing list + * @next: points to the first element of the existing list * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... * - * @head can be RCU-read traversed concurrently with this function. + * The list pointed to by @prev and @next can be RCU-read traversed + * concurrently with this function. * * Note that this function blocks. * - * Important note: the caller must take whatever action is necessary to - * prevent any other updates to @head. In principle, it is possible - * to modify the list as soon as sync() begins execution. - * If this sort of thing becomes necessary, an alternative version - * based on call_rcu() could be created. But only if -really- - * needed -- there is no shortage of RCU API members. + * Important note: the caller must take whatever action is necessary to prevent + * any other updates to the existing list. In principle, it is possible to + * modify the list as soon as sync() begins execution. If this sort of thing + * becomes necessary, an alternative version based on call_rcu() could be + * created. But only if -really- needed -- there is no shortage of RCU API + * members. */ -static inline void list_splice_init_rcu(struct list_head *list, - struct list_head *head, - void (*sync)(void)) +static inline void __list_splice_init_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next, + void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; - struct list_head *at = head->next; - - if (list_empty(list)) - return; /* * "first" and "last" tracking list, so initialize it. RCU readers @@ -231,10 +230,40 @@ static inline void list_splice_init_rcu(struct list_head *list, * this function. */ - last->next = at; - rcu_assign_pointer(list_next_rcu(head), first); - first->prev = head; - at->prev = last; + last->next = next; + rcu_assign_pointer(list_next_rcu(prev), first); + first->prev = prev; + next->prev = last; +} + +/** + * list_splice_init_rcu - splice an RCU-protected list into an existing list, + * designed for stacks. + * @list: the RCU-protected list to splice + * @head: the place in the existing list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... + */ +static inline void list_splice_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync)(void)) +{ + if (!list_empty(list)) + __list_splice_init_rcu(list, head, head->next, sync); +} + +/** + * list_splice_tail_init_rcu - splice an RCU-protected list into an existing + * list, designed for queues. + * @list: the RCU-protected list to splice + * @head: the place in the existing list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... 
+ */ +static inline void list_splice_tail_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync)(void)) +{ + if (!list_empty(list)) + __list_splice_init_rcu(list, head->prev, head, sync); +} /** -- cgit v1.1 From 2f073848c3cc8aff2655ab7c46d8c0de90cf4e50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Oct 2015 16:56:42 -0700 Subject: list: Use WRITE_ONCE() when initializing list_head structures Code that does lockless emptiness testing of non-RCU lists is relying on INIT_LIST_HEAD() to write the list head's ->next pointer atomically, particularly when INIT_LIST_HEAD() is invoked from list_del_init(). This commit therefore adds WRITE_ONCE() to this function's pointer stores that could affect the head's ->next pointer. Reported-by: Andrey Konovalov Signed-off-by: Paul E. McKenney --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/list.h b/include/linux/list.h index 06c2d88..5356f4d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -24,7 +24,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { - list->next = list; + WRITE_ONCE(list->next, list); list->prev = list; } -- cgit v1.1 From 649e4368ff786e3d02eb2a06b1493fb217d74408 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 13:32:08 -0700 Subject: documentation: Record RCU requirements This commit adds RCU requirements as published in a 2015 LWN series. Bringing these requirements in-tree allows them to be updated as changes are discovered. Signed-off-by: Paul E. McKenney [ paulmck: Updates to charset and URLs as suggested by Josh Triplett. ] --- .../RCU/Design/Requirements/2013-08-is-it-dead.png | Bin 0 -> 100825 bytes .../Design/Requirements/GPpartitionReaders1.svg | 374 +++ .../RCU/Design/Requirements/RCUApplicability.svg | 237 ++ .../Design/Requirements/ReadersPartitionGP1.svg | 639 +++++ .../RCU/Design/Requirements/Requirements.html | 2799 ++++++++++++++++++++ .../RCU/Design/Requirements/Requirements.htmlx | 2643 ++++++++++++++++++ Documentation/RCU/Design/htmlqqz.sh | 108 + 7 files changed, 6800 insertions(+) create mode 100644 Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png create mode 100644 Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg create mode 100644 Documentation/RCU/Design/Requirements/RCUApplicability.svg create mode 100644 Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg create mode 100644 Documentation/RCU/Design/Requirements/Requirements.html create mode 100644 Documentation/RCU/Design/Requirements/Requirements.htmlx create mode 100755 Documentation/RCU/Design/htmlqqz.sh
diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png new file mode 100644 index 0000000..7496a55 Binary files /dev/null and b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png differ
diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg new file mode 100644 index 0000000..4b4014f --- /dev/null +++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg @@ -0,0 +1,374 @@
+[SVG markup not reproduced here; the figure shows thread0(), thread1(), and thread2(), with thread0() and thread2() inside rcu_read_lock()/rcu_read_unlock() sections, thread1() invoking synchronize_rcu(), the WRITE_ONCE()/READ_ONCE() accesses to a, b, and c, and circled QS quiescent-state markers.]
diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg new file mode 100644 index 0000000..ebcbeee --- /dev/null +++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg @@ -0,0 +1,237 @@
+[SVG markup not reproduced here; the figure is a spectrum of RCU applicability: "Read-Mostly, Stale & Inconsistent Data OK (RCU Works Great!!!)"; "Read-Mostly, Need Consistent Data (RCU Works Well)"; "Read-Write, Need Consistent Data (RCU Might Be OK...)"; "Update-Mostly, Need Consistent Data (RCU is Very Unlikely to be the Right Tool For The Job, But it Can: (1) Provide Existence Guarantees For Update-Friendly Mechanisms (2) Provide Wait-Free Read-Side Primitives for Real-Time Use)".]
diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg new file mode 100644 index 0000000..48cd162 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg @@ -0,0 +1,639 @@
+[SVG markup not reproduced here; the figure extends the previous one to thread0() through thread4(), with two synchronize_rcu() invocations, WRITE_ONCE()/READ_ONCE() accesses to a through e, and circled QS quiescent-state markers.]
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html new file mode 100644 index 0000000..36de7aa --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -0,0 +1,2799 @@
+A Tour Through RCU's Requirements [LWN.net]

A Tour Through RCU's Requirements

+ +

Copyright IBM Corporation, 2015

+

Author: Paul E. McKenney

+

The initial version of this document appeared in the +LWN articles +here, +here, and +here.

+ +

Introduction

+ +

+Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +

+This document therefore summarizes RCU's requirements, and can be thought +of as an informal, high-level specification for RCU. +It is important to understand that RCU's specification is primarily +empirical in nature; +in fact, I learned about many of these requirements the hard way. +This situation might cause some consternation; however, not only +has this learning process been a lot of fun, but it has also been +a great privilege to work with so many people willing to apply +technologies in interesting new ways. + 

+All that aside, here are the categories of currently known RCU requirements: +

+ +
    +
  1. + Fundamental Requirements +
  2. Fundamental Non-Requirements +
  3. + Parallelism Facts of Life +
  4. + Quality-of-Implementation Requirements +
  5. + Linux Kernel Complications +
  6. + Software-Engineering Requirements +
  7. + Other RCU Flavors +
  8. + Possible Future Changes +
+ +

+This is followed by a summary, +which is in turn followed by the inevitable +answers to the quick quizzes. + +

Fundamental Requirements

+ +

+RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +

    +
  1. + Grace-Period Guarantee +
  2. + Publish-Subscribe Guarantee +
  3. + RCU Primitives Guaranteed to Execute Unconditionally +
  4. + Guaranteed Read-to-Write Upgrade +
+ +

Grace-Period Guarantee

+ +

+RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +

+RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker rcu_read_lock() and ends with +the marker rcu_read_unlock(). +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of rcu_read_lock() and +rcu_read_unlock() are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with CONFIG_PREEMPT=n. + +

+This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +

+
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5   rcu_read_lock();
+ 6   r1 = READ_ONCE(x);
+ 7   r2 = READ_ONCE(y);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   WRITE_ONCE(x, 1);
+14   synchronize_rcu();
+15   WRITE_ONCE(y, 1);
+16 }
+
+
+ +

+Because the synchronize_rcu() on line 14 waits for +all pre-existing readers, any instance of thread0() that +loads a value of zero from x must complete before +thread1() stores to y, so that instance must +also load a value of zero from y. +Similarly, any instance of thread0() that loads a value of +one from y must have started after the +synchronize_rcu() started, and must therefore also load +a value of one from x. +Therefore, the outcome: +

+
+(r1 == 0 && r2 == 1)
+
+
+cannot happen. + +

Quick Quiz 1: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? +
Answer + +

+This scenario resembles one of the first uses of RCU in +DYNIX/ptx, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +

+
+ 1 #define STATE_NORMAL        0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING    2
+ 4 #define STATE_WANT_NORMAL   3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10   int state_snap;
+11
+12   rcu_read_lock();
+13   state_snap = READ_ONCE(state);
+14   if (state_snap == STATE_NORMAL)
+15     do_something();
+16   else
+17     do_something_carefully();
+18   rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24   synchronize_rcu();
+25   WRITE_ONCE(state, STATE_RECOVERING);
+26   recovery();
+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
+28   synchronize_rcu();
+29   WRITE_ONCE(state, STATE_NORMAL);
+30 }
+
+
+ +

+The RCU read-side critical section in do_something_dlm() +works with the synchronize_rcu() in start_recovery() +to guarantee that do_something() never runs concurrently +with recovery(), but with little or no synchronization +overhead in do_something_dlm(). + +

Quick Quiz 2: +Why is the synchronize_rcu() on line 28 needed? +
Answer + +

+In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +synchronize_rcu(). +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of synchronize_rcu(). + +
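
+For example, the following reader (a minimal sketch, with
+do_something() as a placeholder) self-deadlocks, because the
+synchronize_rcu() on line 5 waits for a grace period that cannot end
+until the enclosing RCU read-side critical section does:

+
+ 1 void buggy_reader(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   do_something();
+ 5   synchronize_rcu(); /* BUG: self-deadlock! */
+ 6   rcu_read_unlock();
+ 7 }
+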

+Although RCU's grace-period guarantee is useful in and of itself, with +quite a few use cases, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function add_gp_buggy() below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the gp pointer, +and, if the value loaded is non-NULL, locklessly accessing the +->a and ->b fields. + +

+
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   gp = p; /* ORDERING BUG */
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +

+
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   gp = p; /* ORDERING BUG */
+12   p->a = a;
+13   p->b = b;
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+If an RCU reader fetches gp just after +add_gp_buggy_optimized executes line 11, +it will see garbage in the ->a and ->b +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +

Publish/Subscribe Guarantee

+ +

+RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses rcu_assign_pointer() to insert the +new data, and readers use rcu_dereference() to +access data, whether new or old. +The following shows an example of insertion: + +

+
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   rcu_assign_pointer(gp, p);
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The rcu_assign_pointer() on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 memory_order_release store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of gp as a scratch +location immediately preceding the assignment. + +
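
+In C11 terms, line 13 could be sketched as the following store
+(illustrative only, and assuming that gp had been declared with an
+atomic type):

+
+atomic_store_explicit(&gp, p, memory_order_release);
+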

Quick Quiz 3: +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? +
Answer + +

+It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in do_something_gp_buggy() below: + +

+
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +DEC Alpha CPUs) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from gp rather than keeping +a separate copy in p as follows: + +

+
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
+ 5     do_something(gp->a, gp->b);
+ 6     rcu_read_unlock();
+ 7     return true;
+ 8   }
+ 9   rcu_read_unlock();
+10   return false;
+11 }
+
+
+ +

+If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of gp->a +and gp->b might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), do_something_gp() uses +rcu_dereference() to fetch from gp: + +

+
+ 1 bool do_something_gp(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+The rcu_dereference() uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +high-quality implementation of C11 memory_order_consume [PDF] +ever appear, then rcu_dereference() could be implemented +as a memory_order_consume load. +Regardless of the exact implementation, a pointer fetched by +rcu_dereference() may not be used outside of the +outermost RCU read-side critical section containing that +rcu_dereference(), unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +reference counting. + +
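
+For example, a reader might pass protection from RCU to a reference
+count roughly as follows (a minimal sketch: the ->refcnt field and the
+foo_put() release helper are hypothetical, while atomic_inc_not_zero()
+is the usual Linux-kernel idiom for this handoff):

+
+ 1 rcu_read_lock();
+ 2 p = rcu_dereference(gp);
+ 3 if (p && !atomic_inc_not_zero(&p->refcnt))
+ 4   p = NULL; /* Object is already on its way out. */
+ 5 rcu_read_unlock();
+ 6 if (p) {
+ 7   do_something(p->a, p->b); /* Safe: reference held. */
+ 8   foo_put(p); /* Drop the reference when done. */
+ 9 }
+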

+In short, updaters use rcu_assign_pointer() and readers +use rcu_dereference(), and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +

+Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +

    +
  1. Remove the data element from the enclosing structure. +
  2. Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +
  3. At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to kfree(). +
+ +This process is implemented by remove_gp_synchronous(): + +
+
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu();
+14   kfree(p);
+15   return true;
+16 }
+
+
+ +

+This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +do_something_gp() before the data element referenced by +p is freed. +The rcu_access_pointer() on line 6 is similar to +rcu_dereference(), except that: + +

    +
  1. The value returned by rcu_access_pointer() + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use rcu_dereference() + instead of rcu_access_pointer(). +
  2. The call to rcu_access_pointer() need not be + protected. + In contrast, rcu_dereference() must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +
+ +

Quick Quiz 4: +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? +
Answer + +

+This simple linked-data-structure scenario clearly demonstrates the need +for RCU's stringent memory-ordering guarantees on systems with more than +one CPU: + +

    +
  1. Each CPU that has an RCU read-side critical section that + begins before synchronize_rcu() starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + synchronize_rcu() returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed struct foo + after the kfree() on line 14 of + remove_gp_synchronous(). +
  2. Each CPU that has an RCU read-side critical section that ends + after synchronize_rcu() returns is guaranteed + to execute a full memory barrier between the time that + synchronize_rcu() begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the kfree() on line 14 of + remove_gp_synchronous() might + later run do_something_gp() and find the + newly deleted struct foo. +
  3. If the task invoking synchronize_rcu() remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + synchronize_rcu(). + This guarantee ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on line 11. +
  4. If the task invoking synchronize_rcu() migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of synchronize_rcu(). + This guarantee also ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on + line 11, even in the case where the thread executing the + synchronize_rcu() migrates in the meantime.
+ +

Quick Quiz 5: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? +
Answer + +

Quick Quiz 6: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? +
Answer + +

+In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + 

RCU Primitives Guaranteed to Execute Unconditionally

+ +

+The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +

+However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with conditional +synchronization primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + 

Guaranteed Read-to-Write Upgrade

+ +

+As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking synchronize_rcu(); however, this +inconvenience can be avoided through use of the +call_rcu() and kfree_rcu() API members +described later in this document. + 
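
+For example, reusing gp and gp_lock from the earlier examples, an
+upgrade might look as follows (a minimal sketch; the recheck on line 5
+guards against a removal that might have occurred between the
+rcu_dereference() and the lock acquisition):

+
+ 1 rcu_read_lock();
+ 2 p = rcu_dereference(gp);
+ 3 if (p) {
+ 4   spin_lock(&gp_lock); /* Upgrade to update-side protection. */
+ 5   if (p == rcu_access_pointer(gp))
+ 6     p->b++; /* Update while still within the reader. */
+ 7   spin_unlock(&gp_lock);
+ 8 }
+ 9 rcu_read_unlock();
+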

Quick Quiz 7: +But how does the upgrade-to-write operation exclude other readers? +
Answer + +

+This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +

Fundamental Non-Requirements

+ +

+RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long; however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. + 

    +
  1. + Readers Impose Minimal Ordering +
  2. + Readers Do Not Exclude Updaters +
  3. + Updaters Only Wait For Old Readers +
  4. + Grace Periods Don't Partition Read-Side Critical Sections +
  5. + Read-Side Critical Sections Don't Partition Grace Periods +
  6. + Disabling Preemption Does Not Block Grace Periods +
+ +

Readers Impose Minimal Ordering

+ +

+Reader-side markers such as rcu_read_lock() and +rcu_read_unlock() provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +synchronize_rcu(). +To see this, consider the following pair of threads: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(x, 1);
+ 5   rcu_read_unlock();
+ 6   rcu_read_lock();
+ 7   WRITE_ONCE(y, 1);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   rcu_read_lock();
+14   r1 = READ_ONCE(y);
+15   rcu_read_unlock();
+16   rcu_read_lock();
+17   r2 = READ_ONCE(x);
+18   rcu_read_unlock();
+19 }
+
+
+ +

+After thread0() and thread1() execute +concurrently, it is quite possible to have + +

+
+(r1 == 1 && r2 == 0)
+
+
+ +(that is, y appears to have been assigned before x), +which would not be possible if rcu_read_lock() and +rcu_read_unlock() had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +

Quick Quiz 8: +Can't the compiler also reorder this code? +
Answer + +

Readers Do Not Exclude Updaters

+ +

+Neither rcu_read_lock() nor rcu_read_unlock() +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   r1 = READ_ONCE(y);
+ 5   if (r1) {
+ 6     do_something_with_nonzero_x();
+ 7     r2 = READ_ONCE(x);
+ 8     WARN_ON(!r2); /* BUG!!! */
+ 9   }
+10   rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15   spin_lock(&my_lock);
+16   WRITE_ONCE(x, 1);
+17   WRITE_ONCE(y, 1);
+18   spin_unlock(&my_lock);
+19 }
+
+
+ +

+If the thread0() function's rcu_read_lock() +excluded the thread1() function's update, +the WARN_ON() could never fire. +But the fact is that rcu_read_lock() does not exclude +much of anything aside from subsequent grace periods, of which +thread1() has none, so the +WARN_ON() can and does fire. + +

Updaters Only Wait For Old Readers

+ +

+It might be tempting to assume that after synchronize_rcu() +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after synchronize_rcu() +starts, and synchronize_rcu() is under no +obligation to wait for these new readers. + +

Quick Quiz 9: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +
Answer + +

+Grace Periods Don't Partition Read-Side Critical Sections

+ +

+It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +a, b, and c are initially all zero: + 

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   r2 = READ_ONCE(b);
+20   r3 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+
+
+ +

+It turns out that the outcome: + +

+
+(r1 == 1 && r2 == 0 && r3 == 1)
+
+
+is entirely possible. +The following figure shows how this can happen, with each circled +QS indicating the point at which RCU recorded a +quiescent state for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: + 

GPpartitionReaders1.svg

+ +

+If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   r2 = READ_ONCE(c);
+19   synchronize_rcu();
+20   WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25   rcu_read_lock();
+26   r3 = READ_ONCE(b);
+27   r4 = READ_ONCE(d);
+28   rcu_read_unlock();
+29 }
+
+
+ +

+Here, if (r1 == 1), then +thread0()'s write to b must happen +before the end of thread1()'s grace period. +If in addition (r4 == 1), then +thread3()'s read from b must happen +after the beginning of thread2()'s grace period. +If it is also the case that (r2 == 1), then the +end of thread1()'s grace period must precede the +beginning of thread2()'s grace period. +This means that the two RCU read-side critical sections cannot overlap, +guaranteeing that (r3 == 1). +As a result, the outcome: + 

+
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+
+
+ +cannot happen. + +

+This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +

+Read-Side Critical Sections Don't Partition Grace Periods

+ +

+It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   WRITE_ONCE(d, 1);
+20   r2 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26   r3 = READ_ONCE(d);
+27   synchronize_rcu();
+28   WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33   rcu_read_lock();
+34   r4 = READ_ONCE(b);
+35   r5 = READ_ONCE(e);
+36   rcu_read_unlock();
+37 }
+
+
+ +

+In this case, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+
+
+ +is entirely possible, as illustrated below: + +

ReadersPartitionGP1.svg

+ +

+Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +

Quick Quiz 10: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +
Answer + +

+Disabling Preemption Does Not Block Grace Periods

+ +

+There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +demonstrated. + +

+If you need a preempt-disable region to block grace periods, you need to add +rcu_read_lock() and rcu_read_unlock(), for example +as follows: + +

+
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+
+
+ +

+In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces rcu_read_unlock() to do +more work. +And no, this is not an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +

+This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +RCU-sched. + +
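
+For example (a minimal sketch using the RCU-sched API as it existed at
+the time of this writing, with do_something() as a placeholder):

+
+ 1 /* Reader: any non-preemptible region is an RCU-sched reader. */
+ 2 preempt_disable();
+ 3 do_something();
+ 4 preempt_enable();
+ 5
+ 6 /* Updater: waits for all pre-existing non-preemptible regions. */
+ 7 synchronize_sched();
+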

Parallelism Facts of Life

+ +

+These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +

    +
  1. Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +
  2. Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +
  3. Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +
  4. As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +
  5. Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2^54 + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time.
  6. Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +
+ +

+This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would otherwise have been unsurprising, even in the early 1990s. + 

Quality-of-Implementation Requirements

+ +

+These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +

    +
  1. Specialization +
  2. Performance and Scalability +
  3. Composability +
  4. Corner Cases +
+ +

+These classes are covered in the following sections. + 

Specialization

+ +

+RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +

RCUApplicability.svg

+ +

+This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the add_gp() and remove_gp_synchronous() +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +

Quick Quiz 11: +What about sleeping locks? +
Answer + +

+It often comes as a surprise that many algorithms do not require a +consistent view of data, and that many of them can function quite well +in that mode, with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. + 

+Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veterinarians might use a heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veterinarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veterinarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: + 

2013-08-is-it-dead.png

+ +

+Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +

+However, there are algorithms that absolutely must see consistent data. +For example, the translation between a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. + +
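
+For example, a sketch of this pattern (the sem_lookup() helper and the
+->lock and ->deleted fields are illustrative, not the actual SystemV
+semaphore code):

+
+ 1 rcu_read_lock();
+ 2 sem = sem_lookup(id); /* RCU-protected lookup. */
+ 3 if (sem) {
+ 4   spin_lock(&sem->lock); /* Consistent view from here on. */
+ 5   if (!sem->deleted)
+ 6     do_semaphore_update(sem);
+ 7   spin_unlock(&sem->lock);
+ 8 }
+ 9 rcu_read_unlock();
+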

+In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +

Performance and Scalability

+ +

+Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +

+Memory consumption is not particularly important in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +bloatwatch +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +tiny RCU +was born. +Josh Triplett has since taken over the small-memory banner with his +Linux kernel tinification +project, which resulted in +SRCU +becoming optional for those kernels not needing it. + 

+The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +rcu_dereference() should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, rcu_read_lock() and +rcu_read_unlock() should have exactly zero overhead. + +
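
+For example, in a CONFIG_PREEMPT=n kernel, the read-side markers can
+reduce to roughly the following (a simplified sketch of the idea, not
+the exact kernel source):

+
+ 1 static inline void rcu_read_lock(void)
+ 2 {
+ 3   preempt_disable(); /* Compiles to nothing if CONFIG_PREEMPT=n. */
+ 4 }
+ 5
+ 6 static inline void rcu_read_unlock(void)
+ 7 {
+ 8   preempt_enable(); /* Likewise a no-op in such kernels. */
+ 9 }
+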

+In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), rcu_read_lock() and +rcu_read_unlock() should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +rcu_read_unlock() may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +

+The synchronize_rcu() grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +synchronize_rcu() are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +1,000 separate invocations +of synchronize_rcu(), thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +

+In some cases, the multi-millisecond synchronize_rcu() +latencies are unacceptable. +In these cases, synchronize_rcu_expedited() may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +synchronize_rcu_expedited() on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of synchronize_rcu_expedited() invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, synchronize_rcu_expedited() +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +

+There are a number of situations where even +synchronize_rcu_expedited()'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous call_rcu() can be +used in place of synchronize_rcu() as follows: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9   struct foo *p = container_of(rhp, struct foo, rh);
+10
+11   kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16   struct foo *p;
+17
+18   spin_lock(&gp_lock);
+19   p = rcu_dereference(gp);
+20   if (!p) {
+21     spin_unlock(&gp_lock);
+22     return false;
+23   }
+24   rcu_assign_pointer(gp, NULL);
+25   call_rcu(&p->rh, remove_gp_cb);
+26   spin_unlock(&gp_lock);
+27   return true;
+28 }
+
+
+ +

+A definition of struct foo is finally needed, and appears
+on lines 1-5.
+The function remove_gp_cb() is passed to call_rcu()
+on line 25, and will be invoked after the end of a subsequent
+grace period.
+This has the same effect as remove_gp_synchronous(),
+but without forcing the updater to wait for a grace period to elapse.
+The call_rcu() function may be used in a number of
+situations where neither synchronize_rcu() nor
+synchronize_rcu_expedited() would be legal,
+including within preempt-disable code, local_bh_disable() code,
+interrupt-disable code, and interrupt handlers.
+However, even call_rcu() is illegal within NMI handlers.
+The callback function (remove_gp_cb() in this case) will be
+executed within a softirq (software interrupt) environment within the
+Linux kernel,
+either within a real softirq handler or under the protection
+of local_bh_disable().
+In both the Linux kernel and in userspace, it is bad practice to
+write an RCU callback function that takes too long.
+Long-running operations should be relegated to separate threads or
+(in the Linux kernel) workqueues, as sketched below.

+
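+Here is one hedged sketch of this deferral idiom, in which the RCU
+callback does nothing but punt the long-running cleanup to a
+workqueue (the structure and function names are illustrative, and
+do_something_slow() is hypothetical):

+
+  struct foo_slow {
+    struct work_struct work;
+    struct rcu_head rh;
+  };
+
+  static void remove_gp_slow_work(struct work_struct *wp)
+  {
+    struct foo_slow *p = container_of(wp, struct foo_slow, work);
+
+    do_something_slow(p);  /* Hypothetical long-running cleanup. */
+    kfree(p);
+  }
+
+  static void remove_gp_slow_cb(struct rcu_head *rhp)
+  {
+    struct foo_slow *p = container_of(rhp, struct foo_slow, rh);
+
+    INIT_WORK(&p->work, remove_gp_slow_work);
+    schedule_work(&p->work);  /* Defer the real work to process context. */
+  }
+
+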

Quick Quiz 12: +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? +
Answer + +

+However, all that remove_gp_cb() is doing is +invoking kfree() on the data element. +This is a common idiom, and is supported by kfree_rcu(), +which allows “fire and forget” operation as shown below: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9   struct foo *p;
+10
+11   spin_lock(&gp_lock);
+12   p = rcu_dereference(gp);
+13   if (!p) {
+14     spin_unlock(&gp_lock);
+15     return false;
+16   }
+17   rcu_assign_pointer(gp, NULL);
+18   kfree_rcu(p, rh);
+19   spin_unlock(&gp_lock);
+20   return true;
+21 }
+
+
+ +

+Note that remove_gp_faf() simply invokes
+kfree_rcu() and proceeds, without any need to pay
+further attention to the subsequent grace period and kfree().
+It is permissible to invoke kfree_rcu() from the same
+environments as for call_rcu().
+Interestingly enough, DYNIX/ptx had the equivalents of
+call_rcu() and kfree_rcu(), but not
+synchronize_rcu().
+This was due to the fact that RCU was not heavily used within DYNIX/ptx,
+so the very few places that needed something like
+synchronize_rcu() simply open-coded it.

+

Quick Quiz 13: +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +
Answer + +

+But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style get_state_synchronize_rcu() and +cond_synchronize_rcu() functions may be used for this +purpose, as shown below: + +

+
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3   struct foo *p;
+ 4   unsigned long s;
+ 5
+ 6   spin_lock(&gp_lock);
+ 7   p = rcu_access_pointer(gp);
+ 8   if (!p) {
+ 9     spin_unlock(&gp_lock);
+10     return false;
+11   }
+12   rcu_assign_pointer(gp, NULL);
+13   spin_unlock(&gp_lock);
+14   s = get_state_synchronize_rcu();
+15   do_something_while_waiting();
+16   cond_synchronize_rcu(s);
+17   kfree(p);
+18   return true;
+19 }
+
+
+ +

+On line 14, get_state_synchronize_rcu() obtains a
+“cookie” from RCU,
+then line 15 carries out other tasks,
+and finally, line 16 returns immediately if a grace period has
+elapsed in the meantime, but otherwise waits as required.
+The need for get_state_synchronize_rcu() and
+cond_synchronize_rcu() has appeared quite recently,
+so it is too early to tell whether they will stand the test of time.

+

+RCU thus provides a range of tools to allow updaters to strike the
+required tradeoff between latency, flexibility, and CPU overhead.

+

Composability

+ +

+Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +

+Implementations of RCU for which rcu_read_lock() +and rcu_read_unlock() generate no code, such as +Linux-kernel RCU when CONFIG_PREEMPT=n, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of rcu_read_lock() +and rcu_read_unlock() are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +

+RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +INT_MAX. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +
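+For example, this sketch shows the forbidden pattern, in which a
+grace-period wait appears within the very reader that it would have
+to wait on (do_part_a() and do_part_b() are hypothetical):

+
+  rcu_read_lock();
+  do_part_a();       /* Hypothetical. */
+  synchronize_rcu(); /* BUG: Grace-period wait within a reader. */
+  do_part_b();       /* Hypothetical. */
+  rcu_read_unlock();
+
+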

+In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +

Corner Cases

+ +

+A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +

+That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +

+Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the call_rcu() code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +synchronize_rcu_expedited(), courtesy of this function's use +of try_stop_cpus(). +(In the future, synchronize_rcu_expedited() will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +

+Although all three of these corner cases were understood in the early
+1990s, a simple user-level test consisting of close(open(path))
+in a tight loop
+in the early 2000s suddenly provided a much deeper appreciation of the
+high-update-rate corner case.
+This test also motivated addition of some RCU code to react to high update
+rates; for example, if a given CPU finds itself with more than 10,000
+RCU callbacks queued, RCU will take evasive action by
+more aggressively starting grace periods and more aggressively forcing
+completion of grace-period processing.
+This evasive action causes the grace period to complete more quickly,
+but at the cost of restricting RCU's batching optimizations, thus
+increasing the CPU overhead incurred by that grace period.

+

+Software-Engineering Requirements

+ +

+Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +

    +
  1. It is all too easy to forget to use rcu_read_lock()
+ everywhere that it is needed, so kernels built with
+ CONFIG_PROVE_RCU=y will splat if
+ rcu_dereference() is used outside of an
+ RCU read-side critical section.
+ Update-side code can use rcu_dereference_protected(),
+ which takes a
+ lockdep expression
+ to indicate what is providing the protection.
+ If the indicated protection is not provided, a lockdep splat
+ is emitted.

+

    +
+ Code shared between readers and updaters can use
+ rcu_dereference_check(), which also takes a
+ lockdep expression, and emits a lockdep splat if neither
+ rcu_read_lock() nor the indicated protection
+ is in place.
+ In addition, rcu_dereference_raw() is used in those
+ (hopefully rare) cases where the required protection cannot
+ be easily described.
+ Finally, rcu_read_lock_held() is provided to
+ allow a function to verify that it has been invoked within
+ an RCU read-side critical section.
+ (A brief sketch of these lockdep-checked accessors appears
+ after this list.)
+ I was made aware of this set of requirements shortly after Thomas
+ Gleixner audited a number of RCU uses.

  2. A given function might wish to check for RCU-related preconditions
+ upon entry, before using any other RCU API.
+ The rcu_lockdep_assert() macro does this job,
+ asserting the expression in kernels having lockdep enabled
+ and doing nothing otherwise.
  3. It is also easy to forget to use rcu_assign_pointer() + and rcu_dereference(), perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with __rcu, after which running sparse + with CONFIG_SPARSE_RCU_POINTER=y will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + patch series. +
  4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y + will splat if a data element is passed to call_rcu() + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding rcu_head structures that are + dynamically allocated are automatically tracked, but + rcu_head structures allocated on the stack + must be initialized with init_rcu_head_on_stack() + and cleaned up with destroy_rcu_head_on_stack(). + Similarly, statically allocated non-stack rcu_head + structures must be initialized with init_rcu_head() + and cleaned up with destroy_rcu_head(). + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + patch. +
  5. An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. +
  6. Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +
  7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related + information is provided via both debugfs and event tracing. +
  8. Open-coded use of rcu_assign_pointer() and + rcu_dereference() to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + linked lists + and, more recently, RCU-protected + hash tables + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +
  9. Some linked structures are created at compile time, but still + require __rcu checking. + The RCU_POINTER_INITIALIZER() macro serves this + purpose. +
  10. It is not necessary to use rcu_assign_pointer() + when creating linked structures that are to be published via + a single external pointer. + The RCU_INIT_POINTER() macro is provided for + this task and also for assigning NULL pointers + at runtime. +
+ +
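+As promised above, here is a brief sketch of the lockdep-checked
+pointer accessors from the first item of the above list, assuming the
+gp pointer and gp_lock used elsewhere in this document:

+
+  struct foo *p;
+
+  /* Update side: gp_lock provides the needed protection. */
+  p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
+
+  /* Code shared between readers and updaters. */
+  p = rcu_dereference_check(gp, lockdep_is_held(&gp_lock));
+
+  /* Verify that this function was invoked within a reader. */
+  WARN_ON_ONCE(!rcu_read_lock_held());
+
+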

+This is not a hard-and-fast list: RCU's diagnostic capabilities will
+continue to be guided by the number and type of usage bugs found
+in real-world RCU usage.

+

Linux Kernel Complications

+ +

+The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +

    +
  1. Configuration. +
  2. Firmware Interface. +
  3. Early Boot. +
  4. + Interrupts and non-maskable interrupts (NMIs). +
  5. Loadable Modules. +
  6. Hotplug CPU. +
  7. Scheduler and RCU. +
  8. Tracing and RCU. +
  9. Energy Efficiency. +
  10. + Performance, Scalability, Response Time, and Reliability. +
+ +

+This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +

Configuration

+ +

+RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's Kconfig options. +And for almost all users, RCU does in fact work well +“out of the box.” + +

+However, there are specialized use cases that are handled by
+kernel boot parameters and Kconfig options.
+Unfortunately, the Kconfig system will explicitly ask users
+about new Kconfig options, which requires that almost all of them
+be hidden behind a CONFIG_RCU_EXPERT Kconfig option.

+

+This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +remind +me of this requirement. + +

Firmware Interface

+ +

+In many cases, the kernel obtains information about the system from the
+firmware, and sometimes things are lost in translation.
+Or the translation is accurate, but the original message is bogus.

+

+For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in ps listings. + +

+RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +interesting complications. + +

Early Boot

+ +

+The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before rcu_init() +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's task_struct is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (rcu_read_lock(), +rcu_read_unlock(), rcu_dereference(), +and rcu_access_pointer()) will operate normally very early on, +as will rcu_assign_pointer(). + +

+Although call_rcu() may be invoked at any
+time during boot, callbacks are not guaranteed to be invoked until after
+the scheduler is fully up and running.
+This delay in callback invocation is due to the fact that RCU does not
+invoke callbacks until it is fully initialized, and this full initialization
+cannot occur until after the scheduler has initialized itself to the
+point where RCU can spawn and run its kthreads.
+In theory, it would be possible to invoke callbacks earlier;
+however, this is not a panacea because there would be severe restrictions
+on what operations those callbacks could invoke.

+

+Perhaps surprisingly, synchronize_rcu(),
+synchronize_rcu_bh()
+(discussed below),
+and
+synchronize_sched()
+will all operate normally
+during very early boot, the reason being that there is only one CPU
+and preemption is disabled.
+This means that a call to synchronize_rcu() (or one of its friends)
+is itself a quiescent
+state and thus a grace period, so the early-boot implementation can
+be a no-op.

+

+Both synchronize_rcu_bh() and synchronize_sched() +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of synchronize_rcu() no +longer works in CONFIG_PREEMPT=y kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that synchronize_rcu() switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to synchronize_rcu() will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking synchronize_rcu() during scheduler +initialization can result in deadlock. + +

Quick Quiz 14: +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? +
Answer + +

+I learned of these boot-time requirements as a result of a series of +system hangs. + +

Interrupts and NMIs

+ +

+The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of call_rcu(). + +

+Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +

+The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +call_rcu(), are prohibited within NMI handlers. + +

+The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +surprised me +with this requirement; +he also kindly surprised me with +an algorithm +that meets this requirement. + +

Loadable Modules

+ +

+The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding mod_timer() must be dealt with +via del_timer_sync() or similar. + +

+Unfortunately, there is no way to cancel an RCU callback; +once you invoke call_rcu(), the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +

+RCU therefore provides +rcu_barrier(), +which waits until all in-flight RCU callbacks have been invoked. +If a module uses call_rcu(), its exit function should therefore +prevent any future invocation of call_rcu(), then invoke +rcu_barrier(). +In theory, the underlying module-unload code could invoke +rcu_barrier() unconditionally, but in practice this would +incur unacceptable latencies. + +
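+For example, here is a hedged sketch of such a module-exit function
+(the function names are illustrative only):

+
+  static void __exit foo_exit(void)
+  {
+    unregister_foo(); /* Hypothetical: stop new call_rcu() invocations. */
+    rcu_barrier();    /* Wait for all in-flight callbacks to be invoked. */
+  }
+  module_exit(foo_exit);
+
+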

+Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. +The need for rcu_barrier() for module unloading became +apparent later. + +

Hotplug CPU

+ +

+The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +

+The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as synchronize_rcu(). +However, expedited grace-period operations such as +synchronize_rcu_expedited() are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +

+In addition, all-callback-wait operations such as +rcu_barrier() are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +

Scheduler and RCU

+ +

+RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless +it also releases them before exiting that same +RCU read-side critical section. +This same prohibition also applies to any lock that is acquired +while holding any lock to which this prohibition applies. +Violating this rule results in deadlock. + +

+For RCU's part, the preemptible-RCU rcu_read_unlock()
+implementation must be written carefully to avoid similar deadlocks.
+In particular, rcu_read_unlock() must tolerate an
+interrupt where the interrupt handler invokes both
+rcu_read_lock() and rcu_read_unlock().
+This possibility requires rcu_read_unlock() to use
+negative nesting levels to avoid destructive recursion via
+the interrupt handler's use of RCU.

+

+This pair of mutual scheduler-RCU requirements came as a +complete surprise. + +

+As noted above, RCU makes use of kthreads, and it is necessary to
+avoid excessive CPU-time accumulation by these kthreads.
+This requirement was no surprise, but RCU's violation of it
+when running context-switch-heavy workloads when built with
+CONFIG_NO_HZ_FULL=y
+did come as a surprise [PDF].
+RCU has made good progress towards meeting this requirement, even
+for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads,
+but there is room for further improvement.

+

Tracing and RCU

+ +

+It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, rcu_dereference_raw_notrace() +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +

Energy Efficiency

+ +

+Interrupting idle CPUs is considered socially unacceptable,
+especially by people with battery-powered embedded systems.
+RCU therefore conserves energy by detecting which CPUs are
+idle, including tracking CPUs that have been interrupted from idle.
+This is a large part of the energy-efficiency requirement,
+and I learned of it via an irate phone call.

+

+Because RCU avoids interrupting idle CPUs, it is illegal to
+execute an RCU read-side critical section on an idle CPU.
+(Kernels built with CONFIG_PROVE_RCU=y will splat
+if you try it.)
+The RCU_NONIDLE() macro and _rcuidle
+event tracing are provided to work around this restriction.
+In addition, rcu_is_watching() may be used to
+test whether or not it is currently legal to run RCU read-side
+critical sections on this CPU.
+I learned of the need for diagnostics on the one hand
+and RCU_NONIDLE() on the other while inspecting
+idle-loop code.
+Steven Rostedt supplied _rcuidle event tracing,
+which is used quite heavily in the idle loop.

+
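+For example, here is a minimal sketch of these two APIs in use from
+within the idle loop (the trace event and helper function are
+hypothetical):

+
+  /* Tell RCU to pay attention for the duration of one statement... */
+  RCU_NONIDLE(trace_foo_idle_event(arg)); /* Hypothetical tracepoint. */
+
+  /* ...or first check whether RCU readers are currently legal. */
+  if (rcu_is_watching())
+    do_rcu_protected_work(); /* Hypothetical. */
+
+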

+It is similarly socially unacceptable to interrupt a
+nohz_full CPU running in userspace.
+RCU must therefore track nohz_full userspace
+execution.
+And in
+CONFIG_NO_HZ_FULL_SYSIDLE=y
+kernels, RCU must separately track idle CPUs on the one hand and
+CPUs that are either idle or executing in userspace on the other.
+In both cases, RCU must be able to sample state at two points in
+time, and be able to determine whether or not some other CPU spent
+any time idle and/or executing in userspace.

+

+These energy-efficiency requirements have proven quite difficult to
+understand and to meet; for example, there have been more than five
+clean-sheet rewrites of RCU's energy-efficiency code, the last of
+which was finally able to demonstrate
+real energy savings running on real hardware [PDF].
+As noted earlier,
+I learned of many of these requirements via angry phone calls:
+Flaming me on the Linux-kernel mailing list was apparently not
+sufficient to fully vent their ire at RCU's energy-efficiency bugs!

+

+Performance, Scalability, Response Time, and Reliability

+ +

+Expanding on the
+earlier discussion,
+RCU is used heavily by hot code paths in performance-critical
+portions of the Linux kernel's networking, security, virtualization,
+and scheduling code paths.
+RCU must therefore use efficient implementations, especially in its
+read-side primitives.
+To that end, it would be good if preemptible RCU's implementation
+of rcu_read_lock() could be inlined; however, doing
+this requires resolving #include issues with the
+task_struct structure.

+

+The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +rcu_node structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead. +In fact, in many cases, increasing load must decrease the +per-operation overhead, witness the batching optimizations for +synchronize_rcu(), call_rcu(), +synchronize_rcu_expedited(), and rcu_barrier(). +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +

+The Linux kernel is used for real-time workloads, especially +in conjunction with the +-rt patchset. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with CONFIG_PREEMPT=y therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +real-time patch +did not meet their needs, in conjunction with some +RCU issues +encountered by a very early version of the -rt patchset. + +

+In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget + +applies to even the largest systems [PDF], +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +

+Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called rcutorture. + +

+Although the need for rcutorture was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +

+Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per day across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +

Other RCU Flavors

+ +

+One of the more surprising things about RCU is that there are now +no fewer than five flavors, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +

    +
  1. Bottom-Half Flavor +
  2. Sched Flavor +
  3. Sleepable RCU +
  4. Tasks RCU +
+ +

Bottom-Half Flavor

+ +

+The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or RCU-bh, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +

+The solution was the creation of RCU-bh, which does +local_bh_disable() +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +

+Because
+rcu_read_lock_bh() and rcu_read_unlock_bh()
+disable and re-enable softirq handlers, any attempt to start a softirq
+handler during an
+RCU-bh read-side critical section will be deferred.
+In this case, rcu_read_unlock_bh()
+will invoke softirq processing, which can take considerable time.
+One can of course argue that this softirq overhead should be associated
+with the code following the RCU-bh read-side critical section rather
+than rcu_read_unlock_bh(), but the fact
+is that most profiling tools cannot be expected to make this sort
+of fine distinction.
+For example, suppose that a three-millisecond-long RCU-bh read-side
+critical section executes during a time of heavy networking load.
+There will very likely be an attempt to invoke at least one softirq
+handler during that three milliseconds, but any such invocation will
+be delayed until the time of the rcu_read_unlock_bh().
+This can of course make it appear at first glance as if
+rcu_read_unlock_bh() was executing very slowly.

+

+The +RCU-bh API +includes +rcu_read_lock_bh(), +rcu_read_unlock_bh(), +rcu_dereference_bh(), +rcu_dereference_bh_check(), +synchronize_rcu_bh(), +synchronize_rcu_bh_expedited(), +call_rcu_bh(), +rcu_barrier_bh(), and +rcu_read_lock_bh_held(). + +
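+As a brief sketch of this API in use, here is a minimal RCU-bh reader,
+assuming the gp pointer used elsewhere in this document and a
+hypothetical do_something_with() function:

+
+  rcu_read_lock_bh();
+  p = rcu_dereference_bh(gp);
+  if (p)
+    do_something_with(p); /* Hypothetical. */
+  rcu_read_unlock_bh();
+
+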

Sched Flavor

+ +

+Before preemptible RCU, waiting for an RCU grace period had the
+side effect of also waiting for all pre-existing interrupt
+and NMI handlers.
+However, there are legitimate preemptible-RCU implementations that
+do not have this property, given that any point in the code outside
+of an RCU read-side critical section can be a quiescent state.
+Therefore, RCU-sched was created, which follows “classic”
+RCU in that an RCU-sched grace period waits for pre-existing
+interrupt and NMI handlers.
+In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched
+APIs have identical implementations, while kernels built with
+CONFIG_PREEMPT=y provide a separate implementation for each.

+

+Note well that in CONFIG_PREEMPT=y kernels, +rcu_read_lock_sched() and rcu_read_unlock_sched() +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, rcu_read_unlock_sched() +will enter the scheduler, with all the latency and overhead entailed. +Just as with rcu_read_unlock_bh(), this can make it look +as if rcu_read_unlock_sched() was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead rcu_read_unlock_sched() invocations. + +

+The +RCU-sched API +includes +rcu_read_lock_sched(), +rcu_read_unlock_sched(), +rcu_read_lock_sched_notrace(), +rcu_read_unlock_sched_notrace(), +rcu_dereference_sched(), +rcu_dereference_sched_check(), +synchronize_sched(), +synchronize_rcu_sched_expedited(), +call_rcu_sched(), +rcu_barrier_sched(), and +rcu_read_lock_sched_held(). +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +preempt_disable() and preempt_enable(), +local_irq_save() and local_irq_restore(), +and so on. + +
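+As a brief sketch, here are equivalent explicitly and implicitly
+marked RCU-sched readers, again assuming the gp pointer used elsewhere
+in this document and a hypothetical do_something_with() function:

+
+  /* Explicit marking. */
+  rcu_read_lock_sched();
+  p = rcu_dereference_sched(gp);
+  if (p)
+    do_something_with(p); /* Hypothetical. */
+  rcu_read_unlock_sched();
+
+  /* Implicit marking via preemption disabling. */
+  preempt_disable();
+  p = rcu_dereference_sched(gp);
+  if (p)
+    do_something_with(p); /* Hypothetical. */
+  preempt_enable();
+
+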

Sleepable RCU

+ +

+For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +sleepable RCU, +or SRCU. + +

+SRCU allows different domains to be defined, with each such domain +defined by an instance of an srcu_struct structure. +A pointer to this structure must be passed in to each SRCU function, +for example, synchronize_srcu(&ss), where +ss is the srcu_struct structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from srcu_read_lock() +to srcu_read_unlock(), for example, as follows: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+
+
+ +

+As noted above, it is legal to block within SRCU read-side critical sections, +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+
+
+ +

+However, if line 5 acquired a mutex that was held across
+a synchronize_srcu() for domain ss,
+deadlock would still be possible.
+Furthermore, if line 5 acquired a mutex that was held across
+a synchronize_srcu() for some other domain ss1,
+and if an ss1-domain SRCU read-side critical section
+acquired another mutex that was held across an ss-domain
+synchronize_srcu(),
+deadlock would again be possible.
+Such a deadlock cycle could extend across an arbitrarily large number
+of different SRCU domains.
+Again, with great power comes great responsibility.

+

+Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that srcu_read_lock() and +srcu_read_unlock() contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the smp_mb__after_srcu_read_unlock() +API, which, in combination with srcu_read_unlock(), +guarantees a full memory barrier. + +

+The +SRCU API +includes +srcu_read_lock(), +srcu_read_unlock(), +srcu_dereference(), +srcu_dereference_check(), +synchronize_srcu(), +synchronize_srcu_expedited(), +call_srcu(), +srcu_barrier(), and +srcu_read_lock_held(). +It also includes +DEFINE_SRCU(), +DEFINE_STATIC_SRCU(), and +init_srcu_struct() +APIs for defining and initializing srcu_struct structures. + +
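+For example, here is a hedged sketch that defines an SRCU domain and
+posts an asynchronous callback to it, reusing the struct foo from
+earlier examples (the callback name is illustrative only):

+
+  DEFINE_STATIC_SRCU(ss);
+
+  static void foo_reclaim_cb(struct rcu_head *rhp)
+  {
+    struct foo *p = container_of(rhp, struct foo, rh);
+
+    kfree(p);
+  }
+
+  /* With p unlinked as in the earlier examples: */
+  call_srcu(&ss, &p->rh, foo_reclaim_cb); /* Waits only for ss-domain readers. */
+
+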

Tasks RCU

+ +

+Some forms of tracing use “trampolines” to handle the
+binary rewriting required to install different types of probes.
+It would be good to be able to free old trampolines, which sounds
+like a job for some form of RCU.
+However, because it is necessary to be able to install a trace
+anywhere in the code, it is not possible to use read-side markers
+such as rcu_read_lock() and rcu_read_unlock().
+In addition, it does not work to have these markers in the trampoline
+itself, because there would need to be instructions following
+rcu_read_unlock().
+Although synchronize_rcu() would guarantee that execution
+reached the rcu_read_unlock(), it would not be able to
+guarantee that execution had completely left the trampoline.

+

+The solution, in the form of +Tasks RCU, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to schedule(), +cond_resched_rcu_qs(), and +synchronize_rcu_tasks(). +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +

+The tasks-RCU API is quite compact, consisting only of +call_rcu_tasks(), +synchronize_rcu_tasks(), and +rcu_barrier_tasks(). + +
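+For example, here is a hedged sketch of trampoline reclamation using
+this API (the trampoline structure and its deallocator are
+illustrative only, not existing kernel code):

+
+  struct foo_trampoline {
+    struct rcu_head rh;
+    /* Executable instructions follow in a real trampoline. */
+  };
+
+  static void trampoline_free_cb(struct rcu_head *rhp)
+  {
+    struct foo_trampoline *tp = container_of(rhp, struct foo_trampoline, rh);
+
+    free_foo_trampoline(tp); /* Hypothetical deallocator. */
+  }
+
+  /* Once no task can newly enter the old trampoline: */
+  call_rcu_tasks(&old_tp->rh, trampoline_free_cb);
+
+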

Possible Future Changes

+ +

+One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +

+Expedited grace periods scan the CPUs, so their latency and overhead +increases with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +

+RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and rcu_barrier() operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a very +good reason. + +

+The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +

+The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does call_rcu() in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +rcutree.rcu_fanout_leaf boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +rcutree.rcu_fanout_leaf has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +

+Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +

+There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +

+RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating call_rcu() instance, though probably not +in production kernels. + +

Summary

+ +

+This document has presented more than two decades' worth of RCU
+requirements.
+Given that the requirements keep changing, this will not be the last
+word on this subject, but at least it serves to get an important
+subset of the requirements set forth.

+

Acknowledgments

+ +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +

+Answers to Quick Quizzes

+ + +

Quick Quiz 1: +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? + + +

Answer: +First, if updaters do not wish to be blocked by readers, they can use +call_rcu() or kfree_rcu(), which will +be discussed later. +Second, even when using synchronize_rcu(), the other +update-side code does run concurrently with readers, whether pre-existing +or not. + + +

Back to Quick Quiz 1. + + +

Quick Quiz 2: +Why is the synchronize_rcu() on line 28 needed? + + +

Answer: +Without that extra grace period, memory reordering could result in +do_something_dlm() executing do_something() +concurrently with the last bits of recovery(). + + +

Back to Quick Quiz 2. + + +

Quick Quiz 3: +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? + + +

Answer: +No, it cannot. +The readers cannot see either of these two fields until +the assignment to gp, by which time both fields are +fully initialized. +So reordering the assignments +to p->a and p->b cannot possibly +cause any problems. + + +

Back to Quick Quiz 3. + + +

Quick Quiz 4: +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? + + +

Answer: +Let's start with what happens to do_something_gp() +if it fails to use rcu_dereference(). +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from gp in a byte-at-a-time +manner, resulting in load tearing, in turn resulting in a bytewise +mash-up of two distinct pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! + 

+For remove_gp_synchronous(), as long as all modifications +to gp are carried out while holding gp_lock, +the above optimizations are harmless. +However, +with CONFIG_SPARSE_RCU_POINTER=y, +sparse will complain if you +define gp with __rcu and then +access it without using +either rcu_access_pointer() or rcu_dereference(). + + +

Back to Quick Quiz 4. + + +

Quick Quiz 5: +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? + + +

Answer: +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of synchronize_rcu(), +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of synchronize_rcu() +can avoid waiting on a given RCU read-side critical section only +if it can prove that synchronize_rcu() started first. + + +

Back to Quick Quiz 5. + + +

Quick Quiz 6: +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? + + +

Answer: +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +

    +
  1. CPU 1: rcu_read_lock() +
  2. CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ +
  3. CPU 0: list_del_rcu(p); +
  4. CPU 0: synchronize_rcu() starts. +
  5. CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ +
  6. CPU 1: rcu_read_unlock() +
  7. CPU 0: synchronize_rcu() returns. +
  8. CPU 0: kfree(p); +
+ +

+Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +

+The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +

    +
  1. CPU 0: list_del_rcu(p); +
  2. CPU 0: synchronize_rcu() starts. +
  3. CPU 1: rcu_read_lock() +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ +
  5. CPU 0: synchronize_rcu() returns. +
  6. CPU 0: kfree(p); +
  7. CPU 1: do_something_with(q->a); /* Boom!!! */ +
  8. CPU 1: rcu_read_unlock() +
+ +

+And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +

+The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! + + +

Back to Quick Quiz 6. + + +

Quick Quiz 7: +But how does the upgrade-to-write operation exclude other readers? + + +

Answer: +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. + + +

Back to Quick Quiz 7. + + +

Quick Quiz 8: +Can't the compiler also reorder this code? + + +

Answer: +No, the volatile casts in READ_ONCE() and +WRITE_ONCE() prevent the compiler from reordering in +this particular case. + + +

Back to Quick Quiz 8. + + +

Quick Quiz 9: +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? + + +

Answer: +No. +Even if synchronize_rcu() were to wait until +all readers had completed, a new reader might start immediately after +synchronize_rcu() completed. +Therefore, the code following +synchronize_rcu() cannot rely on there being no readers +in any case. + + +

Back to Quick Quiz 9. + + +

Quick Quiz 10: +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? + + +

Answer: +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. + + +

Back to Quick Quiz 10. + + +

Quick Quiz 11: +What about sleeping locks? + + +

Answer: +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +(SRCU) +read-side critical sections. +In addition, the -rt patchset turns spinlocks into sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquired within -rt-Linux-kernel RCU read-side critical sections. + 

+Note that it is legal for a normal RCU read-side critical section
+to conditionally acquire a sleeping lock (as in mutex_trylock()),
+but only as long as it does not loop indefinitely attempting to
+conditionally acquire that sleeping lock.
+The key point is that things like mutex_trylock()
+either return with the mutex held, or return an error indication if
+the mutex was not immediately available.
+Either way, mutex_trylock() returns immediately without sleeping.

+

Back to Quick Quiz 11. + + +

Quick Quiz 12: +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? + + +

Answer: +Presumably the ->gp_lock acquired on line 18 excludes +any changes, including any insertions that rcu_dereference() +would protect against. +Therefore, any insertions will be delayed until after ->gp_lock +is released on line 26, which in turn means that +rcu_access_pointer() suffices. + 

Back to Quick Quiz 12. + + +

Quick Quiz 13: +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? + + +

Answer: +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +call_rcu() or kfree_rcu() can proceed to the +next update as soon as it has invoked call_rcu() or +kfree_rcu(), without having to wait for a subsequent +grace period. + + +

Back to Quick Quiz 13. + + +

Quick Quiz 14: +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? + + +

Answer: +In CONFIG_PREEMPT=n kernels, synchronize_rcu() +maps directly to synchronize_sched(). +Therefore, synchronize_rcu() works normally throughout +boot in CONFIG_PREEMPT=n kernels. +However, your code must also work in CONFIG_PREEMPT=y kernels, +so it is still necessary to avoid invoking synchronize_rcu() +during scheduler initialization. + 

Back to Quick Quiz 14. + + + diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx new file mode 100644 index 0000000..1168010 --- /dev/null +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -0,0 +1,2643 @@ + + + A Tour Through RCU's Requirements [LWN.net] + + +

A Tour Through RCU's Requirements

+ +

Copyright IBM Corporation, 2015

+

Author: Paul E. McKenney

+

The initial version of this document appeared in the +LWN articles +here, +here, and +here.

+ +

Introduction

+ +

+Read-copy update (RCU) is a synchronization mechanism that is often +used as a replacement for reader-writer locking. +RCU is unusual in that updaters do not block readers, +which means that RCU's read-side primitives can be exceedingly fast +and scalable. +In addition, updaters can make useful forward progress concurrently +with readers. +However, all this concurrency between RCU readers and updaters does raise +the question of exactly what RCU readers are doing, which in turn +raises the question of exactly what RCU's requirements are. + +

+This document therefore summarizes RCU's requirements, and can be thought
+of as an informal, high-level specification for RCU.
+It is important to understand that RCU's specification is primarily
+empirical in nature;
+in fact, I learned about many of these requirements the hard way.
+This situation might cause some consternation; however, not only
+has this learning process been a lot of fun, but it has also been
+a great privilege to work with so many people willing to apply
+technologies in interesting new ways.

+

+All that aside, here are the categories of currently known RCU requirements: +

+ +
    +
  1. + Fundamental Requirements +
  2. Fundamental Non-Requirements +
  3. + Parallelism Facts of Life +
  4. + Quality-of-Implementation Requirements +
  5. + Linux Kernel Complications +
  6. + Software-Engineering Requirements +
  7. + Other RCU Flavors +
  8. + Possible Future Changes +
+ +

+This is followed by a summary, +which is in turn followed by the inevitable +answers to the quick quizzes. + +

Fundamental Requirements

+ +

+RCU's fundamental requirements are the closest thing RCU has to hard +mathematical requirements. +These are: + +

    +
  1. + Grace-Period Guarantee +
  2. + Publish-Subscribe Guarantee +
  3. + RCU Primitives Guaranteed to Execute Unconditionally +
  4. + Guaranteed Read-to-Write Upgrade +
+ +

Grace-Period Guarantee

+ +

+RCU's grace-period guarantee is unusual in being premeditated: +Jack Slingwine and I had this guarantee firmly in mind when we started +work on RCU (then called “rclock”) in the early 1990s. +That said, the past two decades of experience with RCU have produced +a much more detailed understanding of this guarantee. + +

+RCU's grace-period guarantee allows updaters to wait for the completion +of all pre-existing RCU read-side critical sections. +An RCU read-side critical section +begins with the marker rcu_read_lock() and ends with +the marker rcu_read_unlock(). +These markers may be nested, and RCU treats a nested set as one +big RCU read-side critical section. +Production-quality implementations of rcu_read_lock() and +rcu_read_unlock() are extremely lightweight, and in +fact have exactly zero overhead in Linux kernels built for production +use with CONFIG_PREEMPT=n. + +

+This guarantee allows ordering to be enforced with extremely low +overhead to readers, for example: + +

+
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5   rcu_read_lock();
+ 6   r1 = READ_ONCE(x);
+ 7   r2 = READ_ONCE(y);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   WRITE_ONCE(x, 1);
+14   synchronize_rcu();
+15   WRITE_ONCE(y, 1);
+16 }
+
+
+ +

+Because the synchronize_rcu() on line 14 waits for +all pre-existing readers, any instance of thread0() that +loads a value of zero from x must complete before +thread1() stores to y, so that instance must +also load a value of zero from y. +Similarly, any instance of thread0() that loads a value of +one from y must have started after the +synchronize_rcu() started, and must therefore also load +a value of one from x. +Therefore, the outcome: +

+
+(r1 == 0 && r2 == 1)
+
+
+cannot happen. + +

@@QQ@@ +Wait a minute! +You said that updaters can make useful forward progress concurrently +with readers, but pre-existing readers will block +synchronize_rcu()!!! +Just who are you trying to fool??? +

@@QQA@@ +First, if updaters do not wish to be blocked by readers, they can use +call_rcu() or kfree_rcu(), which will +be discussed later. +Second, even when using synchronize_rcu(), the other +update-side code does run concurrently with readers, whether pre-existing +or not. +

@@QQE@@ + +

+This scenario resembles one of the first uses of RCU in +DYNIX/ptx, +which managed a distributed lock manager's transition into +a state suitable for handling recovery from node failure, +more or less as follows: + +

+
+ 1 #define STATE_NORMAL        0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING    2
+ 4 #define STATE_WANT_NORMAL   3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10   int state_snap;
+11
+12   rcu_read_lock();
+13   state_snap = READ_ONCE(state);
+14   if (state_snap == STATE_NORMAL)
+15     do_something();
+16   else
+17     do_something_carefully();
+18   rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24   synchronize_rcu();
+25   WRITE_ONCE(state, STATE_RECOVERING);
+26   recovery();
+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
+28   synchronize_rcu();
+29   WRITE_ONCE(state, STATE_NORMAL);
+30 }
+
+
+ +

+The RCU read-side critical section in do_something_dlm() +works with the synchronize_rcu() in start_recovery() +to guarantee that do_something() never runs concurrently +with recovery(), but with little or no synchronization +overhead in do_something_dlm(). + +

@@QQ@@ +Why is the synchronize_rcu() on line 28 needed? +

@@QQA@@ +Without that extra grace period, memory reordering could result in +do_something_dlm() executing do_something() +concurrently with the last bits of recovery(). +

@@QQE@@ + +

+In order to avoid fatal problems such as deadlocks, +an RCU read-side critical section must not contain calls to +synchronize_rcu(). +Similarly, an RCU read-side critical section must not +contain anything that waits, directly or indirectly, on completion of +an invocation of synchronize_rcu(). + +
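+
+For example, the following (hypothetical) function is illegal and will
+deadlock, because the grace period cannot end until the enclosing
+RCU read-side critical section ends, which in turn cannot happen until
+synchronize_rcu() returns:
+
+ 1 void buggy_reader(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   synchronize_rcu();  /* BUG: waits on its own critical section! */
+ 5   rcu_read_unlock();
+ 6 }
+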

+Although RCU's grace-period guarantee is useful in and of itself, with +quite a few use cases, +it would be good to be able to use RCU to coordinate read-side +access to linked data structures. +For this, the grace-period guarantee is not sufficient, as can +be seen in function add_gp_buggy() below. +We will look at the reader's code later, but in the meantime, just think of +the reader as locklessly picking up the gp pointer, +and, if the value loaded is non-NULL, locklessly accessing the +->a and ->b fields. + +

+
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   gp = p; /* ORDERING BUG */
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The problem is that both the compiler and weakly ordered CPUs are within +their rights to reorder this code as follows: + +

+
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   gp = p; /* ORDERING BUG */
+12   p->a = a;
+13   p->b = b;
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+If an RCU reader fetches gp just after +add_gp_buggy_optimized executes line 11, +it will see garbage in the ->a and ->b +fields. +And this is but one of many ways in which compiler and hardware optimizations +could cause trouble. +Therefore, we clearly need some way to prevent the compiler and the CPU from +reordering in this manner, which brings us to the publish-subscribe +guarantee discussed in the next section. + +

Publish/Subscribe Guarantee

+ +

+RCU's publish-subscribe guarantee allows data to be inserted +into a linked data structure without disrupting RCU readers. +The updater uses rcu_assign_pointer() to insert the +new data, and readers use rcu_dereference() to +access data, whether new or old. +The following shows an example of insertion: + +

+
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4   if (!p)
+ 5     return -ENOMEM;
+ 6   spin_lock(&gp_lock);
+ 7   if (rcu_access_pointer(gp)) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   p->a = a;
+12   p->b = b;
+13   rcu_assign_pointer(gp, p);
+14   spin_unlock(&gp_lock);
+15   return true;
+16 }
+
+
+ +

+The rcu_assign_pointer() on line 13 is conceptually +equivalent to a simple assignment statement, but also guarantees +that its assignment will +happen after the two assignments in lines 11 and 12, +similar to the C11 memory_order_release store operation. +It also prevents any number of “interesting” compiler +optimizations, for example, the use of gp as a scratch +location immediately preceding the assignment. + +

@@QQ@@ +But rcu_assign_pointer() does nothing to prevent the +two assignments to p->a and p->b +from being reordered. +Can't that also cause problems? +

@@QQA@@ +No, it cannot. +The readers cannot see either of these two fields until +the assignment to gp, by which time both fields are +fully initialized. +So reordering the assignments +to p->a and p->b cannot possibly +cause any problems. +

@@QQE@@ + +

+It is tempting to assume that the reader need not do anything special +to control its accesses to the RCU-protected data, +as shown in do_something_gp_buggy() below: + +

+
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+However, this temptation must be resisted because there are a +surprisingly large number of ways that the compiler +(to say nothing of +DEC Alpha CPUs) +can trip this code up. +For but one example, if the compiler were short of registers, it +might choose to refetch from gp rather than keeping +a separate copy in p as follows: + +

+
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
+ 5     do_something(gp->a, gp->b);
+ 6     rcu_read_unlock();
+ 7     return true;
+ 8   }
+ 9   rcu_read_unlock();
+10   return false;
+11 }
+
+
+ +

+If this function ran concurrently with a series of updates that +replaced the current structure with a new one, +the fetches of gp->a +and gp->b might well come from two different structures, +which could cause serious confusion. +To prevent this (and much else besides), do_something_gp() uses +rcu_dereference() to fetch from gp: + +

+
+ 1 bool do_something_gp(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   p = rcu_dereference(gp);
+ 5   if (p) {
+ 6     do_something(p->a, p->b);
+ 7     rcu_read_unlock();
+ 8     return true;
+ 9   }
+10   rcu_read_unlock();
+11   return false;
+12 }
+
+
+ +

+The rcu_dereference() uses volatile casts and (for DEC Alpha) +memory barriers in the Linux kernel. +Should a +high-quality implementation of C11 memory_order_consume [PDF] +ever appear, then rcu_dereference() could be implemented +as a memory_order_consume load. +Regardless of the exact implementation, a pointer fetched by +rcu_dereference() may not be used outside of the +outermost RCU read-side critical section containing that +rcu_dereference(), unless protection of +the corresponding data element has been passed from RCU to some +other synchronization mechanism, most commonly locking or +reference counting. + +
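+
+As a minimal (hypothetical) illustration of this rule, the following
+fragment is buggy because the pointer is used after the critical
+section ends, without having been handed off to any other
+synchronization mechanism:
+
+ 1 rcu_read_lock();
+ 2 p = rcu_dereference(gp);
+ 3 rcu_read_unlock();
+ 4 if (p)
+ 5   do_something(p->a, p->b);  /* BUG: p may already be freed! */
+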

+In short, updaters use rcu_assign_pointer() and readers +use rcu_dereference(), and these two RCU API elements +work together to ensure that readers have a consistent view of +newly added data elements. + +

+Of course, it is also necessary to remove elements from RCU-protected +data structures, for example, using the following process: + +

    +
  1. Remove the data element from the enclosing structure. +
  2. Wait for all pre-existing RCU read-side critical sections + to complete (because only pre-existing readers can possibly have + a reference to the newly removed data element). +
  3. At this point, only the updater has a reference to the + newly removed data element, so it can safely reclaim + the data element, for example, by passing it to kfree(). +
+ +This process is implemented by remove_gp_synchronous(): + +
+
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3   struct foo *p;
+ 4
+ 5   spin_lock(&gp_lock);
+ 6   p = rcu_access_pointer(gp);
+ 7   if (!p) {
+ 8     spin_unlock(&gp_lock);
+ 9     return false;
+10   }
+11   rcu_assign_pointer(gp, NULL);
+12   spin_unlock(&gp_lock);
+13   synchronize_rcu();
+14   kfree(p);
+15   return true;
+16 }
+
+
+ +

+This function is straightforward, with line 13 waiting for a grace +period before line 14 frees the old data element. +This waiting ensures that readers will reach line 7 of +do_something_gp() before the data element referenced by +p is freed. +The rcu_access_pointer() on line 6 is similar to +rcu_dereference(), except that: + +

    +
  1. The value returned by rcu_access_pointer() + cannot be dereferenced. + If you want to access the value pointed to as well as + the pointer itself, use rcu_dereference() + instead of rcu_access_pointer(). +
  2. The call to rcu_access_pointer() need not be + protected. + In contrast, rcu_dereference() must either be + within an RCU read-side critical section or in a code + segment where the pointer cannot change, for example, in + code protected by the corresponding update-side lock. +
+ +

@@QQ@@ +Without the rcu_dereference() or the +rcu_access_pointer(), what destructive optimizations +might the compiler make use of? +

@@QQA@@ +Let's start with what happens to do_something_gp() +if it fails to use rcu_dereference(). +It could reuse a value formerly fetched from this same pointer. +It could also fetch the pointer from gp in a byte-at-a-time +manner, resulting in load tearing, in turn resulting in a bytewise +mash-up of two distinct pointer values. +It might even use value-speculation optimizations, where it makes a wrong +guess, but by the time it gets around to checking the value, an update +has changed the pointer to match the wrong guess. +Too bad about any dereferences that returned pre-initialization garbage +in the meantime! +

+For remove_gp_synchronous(), as long as all modifications +to gp are carried out while holding gp_lock, +the above optimizations are harmless. +However, +with CONFIG_SPARSE_RCU_POINTER=y, +sparse will complain if you +define gp with __rcu and then +access it without using +either rcu_access_pointer() or rcu_dereference(). +

@@QQE@@ + +

+This simple linked-data-structure scenario clearly demonstrates the need +for RCU's stringent memory-ordering guarantees on systems with more than +one CPU: + +

    +
  1. Each CPU that has an RCU read-side critical section that + begins before synchronize_rcu() starts is + guaranteed to execute a full memory barrier between the time + that the RCU read-side critical section ends and the time that + synchronize_rcu() returns. + Without this guarantee, a pre-existing RCU read-side critical section + might hold a reference to the newly removed struct foo + after the kfree() on line 14 of + remove_gp_synchronous(). +
  2. Each CPU that has an RCU read-side critical section that ends + after synchronize_rcu() returns is guaranteed + to execute a full memory barrier between the time that + synchronize_rcu() begins and the time that the RCU + read-side critical section begins. + Without this guarantee, a later RCU read-side critical section + running after the kfree() on line 14 of + remove_gp_synchronous() might + later run do_something_gp() and find the + newly deleted struct foo. +
  3. If the task invoking synchronize_rcu() remains + on a given CPU, then that CPU is guaranteed to execute a full + memory barrier sometime during the execution of + synchronize_rcu(). + This guarantee ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on line 11. +
  4. If the task invoking synchronize_rcu() migrates + among a group of CPUs during that invocation, then each of the + CPUs in that group is guaranteed to execute a full memory barrier + sometime during the execution of synchronize_rcu(). + This guarantee also ensures that the kfree() on + line 14 of remove_gp_synchronous() really does + execute after the removal on + line 11, but also in the case where the thread executing the + synchronize_rcu() migrates in the meantime. +
+ +

@@QQ@@ +Given that multiple CPUs can start RCU read-side critical sections +at any time without any ordering whatsoever, how can RCU possibly tell whether +or not a given RCU read-side critical section starts before a +given instance of synchronize_rcu()? +

@@QQA@@ +If RCU cannot tell whether or not a given +RCU read-side critical section starts before a +given instance of synchronize_rcu(), +then it must assume that the RCU read-side critical section +started first. +In other words, a given instance of synchronize_rcu() +can avoid waiting on a given RCU read-side critical section only +if it can prove that synchronize_rcu() started first. +

@@QQE@@ + +

@@QQ@@ +The first and second guarantees require unbelievably strict ordering! +Are all these memory barriers really required? +

@@QQA@@ +Yes, they really are required. +To see why the first guarantee is required, consider the following +sequence of events: + +

    +
  1. CPU 1: rcu_read_lock() +
  2. CPU 1: q = rcu_dereference(gp); + /* Very likely to return p. */ +
  3. CPU 0: list_del_rcu(p); +
  4. CPU 0: synchronize_rcu() starts. +
  5. CPU 1: do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */ +
  6. CPU 1: rcu_read_unlock() +
  7. CPU 0: synchronize_rcu() returns. +
  8. CPU 0: kfree(p); +
+ +

+Therefore, there absolutely must be a full memory barrier between the +end of the RCU read-side critical section and the end of the +grace period. + +

+The sequence of events demonstrating the necessity of the second rule +is roughly similar: + +

    +
  1. CPU 0: list_del_rcu(p); +
  2. CPU 0: synchronize_rcu() starts. +
  3. CPU 1: rcu_read_lock() +
  4. CPU 1: q = rcu_dereference(gp); + /* Might return p if no memory barrier. */ +
  5. CPU 0: synchronize_rcu() returns. +
  6. CPU 0: kfree(p); +
  7. CPU 1: do_something_with(q->a); /* Boom!!! */ +
  8. CPU 1: rcu_read_unlock() +
+ +

+And similarly, without a memory barrier between the beginning of the +grace period and the beginning of the RCU read-side critical section, +CPU 1 might end up accessing the freelist. + +

+The “as if” rule of course applies, so that any implementation +that acts as if the appropriate memory barriers were in place is a +correct implementation. +That said, it is much easier to fool yourself into believing that you have +adhered to the as-if rule than it is to actually adhere to it! +

@@QQE@@ + +

+In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

RCU Primitives Guaranteed to Execute Unconditionally

+ +

+The common-case RCU primitives are unconditional. +They are invoked, they do their job, and they return, with no possibility +of error, and no need to retry. +This is a key RCU design philosophy. + +

+However, this philosophy is pragmatic rather than pigheaded. +If someone comes up with a good justification for a particular conditional +RCU primitive, it might well be implemented and added. +After all, this guarantee was reverse-engineered, not premeditated. +The unconditional nature of the RCU primitives was initially an +accident of implementation, and later experience with conditional +synchronization primitives caused me to elevate this +accident to a guarantee. +Therefore, the justification for adding a conditional primitive to +RCU would need to be based on detailed and compelling use cases. + +

Guaranteed Read-to-Write Upgrade

+ +

+As far as RCU is concerned, it is always possible to carry out an +update within an RCU read-side critical section. +For example, that RCU read-side critical section might search for +a given data element, and then might acquire the update-side +spinlock in order to update that element, all while remaining +in that RCU read-side critical section. +Of course, it is necessary to exit the RCU read-side critical section +before invoking synchronize_rcu(); however, this +inconvenience can be avoided through use of the +call_rcu() and kfree_rcu() API members +described later in this document. +
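+
+The following minimal sketch illustrates such an upgrade, reusing
+gp and gp_lock from the earlier examples; the function name and its
+key and new_b parameters are hypothetical, with the test against
+key standing in for the search step:
+
+ 1 bool update_if_match(int key, int new_b)
+ 2 {
+ 3   struct foo *p;
+ 4   bool ret = false;
+ 5
+ 6   rcu_read_lock();
+ 7   p = rcu_dereference(gp);
+ 8   if (p && p->a == key) {
+ 9     spin_lock(&gp_lock);  /* Read-to-write upgrade. */
+10     p->b = new_b;         /* Update while still in the reader. */
+11     spin_unlock(&gp_lock);
+12     ret = true;
+13   }
+14   rcu_read_unlock();
+15   return ret;
+16 }
+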

@@QQ@@ +But how does the upgrade-to-write operation exclude other readers? +

@@QQA@@ +It doesn't, just like normal RCU updates, which also do not exclude +RCU readers. +

@@QQE@@ + +

+This guarantee allows lookup code to be shared between read-side +and update-side code, and was premeditated, appearing in the earliest +DYNIX/ptx RCU documentation. + +

Fundamental Non-Requirements

+ +

+RCU provides extremely lightweight readers, and its read-side guarantees, +though quite useful, are correspondingly lightweight. +It is therefore all too easy to assume that RCU is guaranteeing more +than it really is. +Of course, the list of things that RCU does not guarantee is infinitely +long; however, the following sections list a few non-guarantees that +have caused confusion. +Except where otherwise noted, these non-guarantees were premeditated. +

    +
  1. + Readers Impose Minimal Ordering +
  2. + Readers Do Not Exclude Updaters +
  3. + Updaters Only Wait For Old Readers +
  4. + Grace Periods Don't Partition Read-Side Critical Sections +
  5. + Read-Side Critical Sections Don't Partition Grace Periods +
  6. + Disabling Preemption Does Not Block Grace Periods +
+ +

Readers Impose Minimal Ordering

+ +

+Reader-side markers such as rcu_read_lock() and +rcu_read_unlock() provide absolutely no ordering guarantees +except through their interaction with the grace-period APIs such as +synchronize_rcu(). +To see this, consider the following pair of threads: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(x, 1);
+ 5   rcu_read_unlock();
+ 6   rcu_read_lock();
+ 7   WRITE_ONCE(y, 1);
+ 8   rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13   rcu_read_lock();
+14   r1 = READ_ONCE(y);
+15   rcu_read_unlock();
+16   rcu_read_lock();
+17   r2 = READ_ONCE(x);
+18   rcu_read_unlock();
+19 }
+
+
+ +

+After thread0() and thread1() execute +concurrently, it is quite possible to have + +

+
+(r1 == 1 && r2 == 0)
+
+
+ +(that is, y appears to have been assigned before x), +which would not be possible if rcu_read_lock() and +rcu_read_unlock() had much in the way of ordering +properties. +But they do not, so the CPU is within its rights +to do significant reordering. +This is by design: Any significant ordering constraints would slow down +these fast-path APIs. + +

@@QQ@@ +Can't the compiler also reorder this code? +

@@QQA@@ +No, the volatile casts in READ_ONCE() and +WRITE_ONCE() prevent the compiler from reordering in +this particular case. +

@@QQE@@ + +

Readers Do Not Exclude Updaters

+ +

+Neither rcu_read_lock() nor rcu_read_unlock() +exclude updates. +All they do is to prevent grace periods from ending. +The following example illustrates this: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   r1 = READ_ONCE(y);
+ 5   if (r1) {
+ 6     do_something_with_nonzero_x();
+ 7     r2 = READ_ONCE(x);
+ 8     WARN_ON(!r2); /* BUG!!! */
+ 9   }
+10   rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15   spin_lock(&my_lock);
+16   WRITE_ONCE(x, 1);
+17   WRITE_ONCE(y, 1);
+18   spin_unlock(&my_lock);
+19 }
+
+
+ +

+If the thread0() function's rcu_read_lock() +excluded the thread1() function's update, +the WARN_ON() could never fire. +But the fact is that rcu_read_lock() does not exclude +much of anything aside from subsequent grace periods, of which +thread1() has none, so the +WARN_ON() can and does fire. + +

Updaters Only Wait For Old Readers

+ +

+It might be tempting to assume that after synchronize_rcu() +completes, there are no readers executing. +This temptation must be avoided because +new readers can start immediately after synchronize_rcu() +starts, and synchronize_rcu() is under no +obligation to wait for these new readers. + +

@@QQ@@ +Suppose that synchronize_rcu() did wait until all readers had completed. +Would the updater be able to rely on this? +

@@QQA@@ +No. +Even if synchronize_rcu() were to wait until +all readers had completed, a new reader might start immediately after +synchronize_rcu() completed. +Therefore, the code following +synchronize_rcu() cannot rely on there being no readers +in any case. +

@@QQE@@ + +

+Grace Periods Don't Partition Read-Side Critical Sections

+ +

+It is tempting to assume that if any part of one RCU read-side critical +section precedes a given grace period, and if any part of another RCU +read-side critical section follows that same grace period, then all of +the first RCU read-side critical section must precede all of the second. +However, this just isn't the case: A single grace period does not +partition the set of RCU read-side critical sections. +An example of this situation can be illustrated as follows, where +a, b, and c are initially all zero: +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   r2 = READ_ONCE(b);
+20   r3 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+
+
+ +

+It turns out that the outcome: + +

+
+(r1 == 1 && r2 == 0 && r3 == 1)
+
+
is entirely possible. +The following figure shows how this can happen, with each circled +QS indicating the point at which RCU recorded a +quiescent state for each thread, that is, a state in which +RCU knows that the thread cannot be in the midst of an RCU read-side +critical section that started before the current grace period: +

GPpartitionReaders1.svg

+ +

+If it is necessary to partition RCU read-side critical sections in this +manner, it is necessary to use two grace periods, where the first +grace period is known to end before the second grace period starts: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   r2 = READ_ONCE(c);
+19   synchronize_rcu();
+20   WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25   rcu_read_lock();
+26   r3 = READ_ONCE(b);
+27   r4 = READ_ONCE(d);
+28   rcu_read_unlock();
+29 }
+
+
+ +

+Here, if (r1 == 1), then +thread0()'s write to b must happen +before the end of thread1()'s grace period. +If in addition (r4 == 1), then +thread3()'s read from b must happen +after the beginning of thread2()'s grace period. +If it is also the case that (r2 == 1), then the +end of thread1()'s grace period must precede the +beginning of thread2()'s grace period. +This means that the two RCU read-side critical sections cannot overlap, +guaranteeing that (r3 == 1). +As a result, the outcome: +

+
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+
+
+ +cannot happen. + +

+This non-requirement was also non-premeditated, but became apparent +when studying RCU's interaction with memory ordering. + +

+Read-Side Critical Sections Don't Partition Grace Periods

+ +

+It is also tempting to assume that if an RCU read-side critical section +happens between a pair of grace periods, then those grace periods cannot +overlap. +However, this temptation leads nowhere good, as can be illustrated by +the following, with all variables initially zero: + +

+
+ 1 void thread0(void)
+ 2 {
+ 3   rcu_read_lock();
+ 4   WRITE_ONCE(a, 1);
+ 5   WRITE_ONCE(b, 1);
+ 6   rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11   r1 = READ_ONCE(a);
+12   synchronize_rcu();
+13   WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18   rcu_read_lock();
+19   WRITE_ONCE(d, 1);
+20   r2 = READ_ONCE(c);
+21   rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26   r3 = READ_ONCE(d);
+27   synchronize_rcu();
+28   WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33   rcu_read_lock();
+34   r4 = READ_ONCE(b);
+35   r5 = READ_ONCE(e);
+36   rcu_read_unlock();
+37 }
+
+
+ +

+In this case, the outcome: + +

+
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+
+
+ +is entirely possible, as illustrated below: + +

ReadersPartitionGP1.svg

+ +

+Again, an RCU read-side critical section can overlap almost all of a +given grace period, just so long as it does not overlap the entire +grace period. +As a result, an RCU read-side critical section cannot partition a pair +of RCU grace periods. + +

@@QQ@@ +How long a sequence of grace periods, each separated by an RCU read-side +critical section, would be required to partition the RCU read-side +critical sections at the beginning and end of the chain? +

@@QQA@@ +In theory, an infinite number. +In practice, an unknown number that is sensitive to both implementation +details and timing considerations. +Therefore, even in practice, RCU users must abide by the theoretical rather +than the practical answer. +

@@QQE@@ + +

+Disabling Preemption Does Not Block Grace Periods

+ +

+There was a time when disabling preemption on any given CPU would block +subsequent grace periods. +However, this was an accident of implementation and is not a requirement. +And in the current Linux-kernel implementation, disabling preemption +on a given CPU in fact does not block grace periods, as Oleg Nesterov +demonstrated. + +

+If you need a preempt-disable region to block grace periods, you need to add +rcu_read_lock() and rcu_read_unlock(), for example +as follows: + +

+
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+
+
+ +

+In theory, you could enter the RCU read-side critical section first, +but it is more efficient to keep the entire RCU read-side critical +section contained in the preempt-disable region as shown above. +Of course, RCU read-side critical sections that extend outside of +preempt-disable regions will work correctly, but such critical sections +can be preempted, which forces rcu_read_unlock() to do +more work. +And no, this is not an invitation to enclose all of your RCU +read-side critical sections within preempt-disable regions, because +doing so would degrade real-time response. + +

+This non-requirement appeared with preemptible RCU. +If you need a grace period that waits on non-preemptible code regions, use +RCU-sched. + +

Parallelism Facts of Life

+ +

+These parallelism facts of life are by no means specific to RCU, but +the RCU implementation must abide by them. +They therefore bear repeating: + +

    +
  1. Any CPU or task may be delayed at any time, + and any attempts to avoid these delays by disabling + preemption, interrupts, or whatever are completely futile. + This is most obvious in preemptible user-level + environments and in virtualized environments (where + a given guest OS's VCPUs can be preempted at any time by + the underlying hypervisor), but can also happen in bare-metal + environments due to ECC errors, NMIs, and other hardware + events. + Although a delay of more than about 20 seconds can result + in splats, the RCU implementation is obligated to use + algorithms that can tolerate extremely long delays, but where + “extremely long” is not long enough to allow + wrap-around when incrementing a 64-bit counter. +
  2. Both the compiler and the CPU can reorder memory accesses. + Where it matters, RCU must use compiler directives and + memory-barrier instructions to preserve ordering. +
  3. Conflicting writes to memory locations in any given cache line + will result in expensive cache misses. + Greater numbers of concurrent writes and more-frequent + concurrent writes will result in more dramatic slowdowns. + RCU is therefore obligated to use algorithms that have + sufficient locality to avoid significant performance and + scalability problems. +
  4. As a rough rule of thumb, only one CPU's worth of processing + may be carried out under the protection of any given exclusive + lock. + RCU must therefore use scalable locking designs. +
  5. Counters are finite, especially on 32-bit systems. + RCU's use of counters must therefore tolerate counter wrap, + or be designed such that counter wrap would take way more + time than a single system is likely to run. + An uptime of ten years is quite possible, a runtime + of a century much less so. + As an example of the latter, RCU's dyntick-idle nesting counter + allows 54 bits for interrupt nesting level (this counter + is 64 bits even on a 32-bit system). + Overflowing this counter requires 2^54 + half-interrupts on a given CPU without that CPU ever going idle. + If a half-interrupt happened every microsecond, it would take + 570 years of runtime to overflow this counter, which is currently + believed to be an acceptably long time. +
  6. Linux systems can have thousands of CPUs running a single + Linux kernel in a single shared-memory environment. + RCU must therefore pay close attention to high-end scalability. +
+ +

+This last parallelism fact of life means that RCU must pay special +attention to the preceding facts of life. +The idea that Linux might scale to systems with thousands of CPUs would +have been met with some skepticism in the 1990s, but these requirements +would otherwise have been unsurprising, even in the early 1990s. +

Quality-of-Implementation Requirements

+ +

+These sections list quality-of-implementation requirements. +Although an RCU implementation that ignores these requirements could +still be used, it would likely be subject to limitations that would +make it inappropriate for industrial-strength production use. +Classes of quality-of-implementation requirements are as follows: + +

    +
  1. Specialization +
  2. Performance and Scalability +
  3. Composability +
  4. Corner Cases +
+ +

+These classes are covered in the following sections. +

Specialization

+ +

+RCU is and always has been intended primarily for read-mostly situations, as +illustrated by the following figure. +This means that RCU's read-side primitives are optimized, often at the +expense of its update-side primitives. + +

RCUApplicability.svg

+ +

+This focus on read-mostly situations means that RCU must interoperate +with other synchronization primitives. +For example, the add_gp() and remove_gp_synchronous() +examples discussed earlier use RCU to protect readers and locking to +coordinate updaters. +However, the need extends much farther, requiring that a variety of +synchronization primitives be legal within RCU read-side critical sections, +including spinlocks, sequence locks, atomic operations, reference +counters, and memory barriers. + +

@@QQ@@ +What about sleeping locks? +

@@QQA@@ +These are forbidden within Linux-kernel RCU read-side critical sections +because it is not legal to place a quiescent state (in this case, +voluntary context switch) within an RCU read-side critical section. +However, sleeping locks may be used within userspace RCU read-side critical +sections, and also within Linux-kernel sleepable RCU +(SRCU) +read-side critical sections. +In addition, the -rt patchset turns spinlocks into sleeping locks so +that the corresponding critical sections can be preempted, which +also means that these sleeplockified spinlocks (but not other sleeping locks!) +may be acquired within -rt-Linux-kernel RCU read-side critical sections. +

+Note that it is legal for a normal RCU read-side critical section +to conditionally acquire a sleeping lock (as in mutex_trylock()), +but only as long as it does not loop indefinitely attempting to +conditionally acquire that sleeping lock. +The key point is that things like mutex_trylock() +either return with the mutex held, or return an error indication if +the mutex was not immediately available. +Either way, mutex_trylock() returns immediately without sleeping. +
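+
+For example, the following (hypothetical) fragment is legal within an
+RCU read-side critical section, assuming a per-element ->mutex field,
+because mutex_trylock() never sleeps:
+
+ 1 rcu_read_lock();
+ 2 p = rcu_dereference(gp);
+ 3 if (p && mutex_trylock(&p->mutex)) {  /* Returns immediately. */
+ 4   do_something(p->a, p->b);
+ 5   mutex_unlock(&p->mutex);
+ 6 }
+ 7 rcu_read_unlock();
+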

@@QQE@@ + +

+It often comes as a surprise that many algorithms do not require a +consistent view of data, with network routing being the poster child. +Internet routing algorithms take significant time to propagate +updates, so that by the time an update arrives at a given system, +that system has been sending network traffic the wrong way for +a considerable length of time. +Having a few threads continue to send traffic the wrong way for a +few more milliseconds is clearly not a problem: In the worst case, +TCP retransmissions will eventually get the data where it needs to go. +In general, when tracking the state of the universe outside of the +computer, some level of inconsistency must be tolerated due to +speed-of-light delays if nothing else. +

+Furthermore, uncertainty about external state is inherent in many cases. +For example, a pair of veterinarians might use heartbeat to determine +whether or not a given cat was alive. +But how long should they wait after the last heartbeat to decide that +the cat is in fact dead? +Waiting less than 400 milliseconds makes no sense because this would +mean that a relaxed cat would be considered to cycle between death +and life more than 100 times per minute. +Moreover, just as with human beings, a cat's heart might stop for +some period of time, so the exact wait period is a judgment call. +One of our pair of veterinarians might wait 30 seconds before pronouncing +the cat dead, while the other might insist on waiting a full minute. +The two veterinarians would then disagree on the state of the cat during +the final 30 seconds of the minute following the last heartbeat, as +fancifully illustrated below: +

2013-08-is-it-dead.png

+ +

+Interestingly enough, this same situation applies to hardware. +When push comes to shove, how do we tell whether or not some +external server has failed? +We send messages to it periodically, and declare it failed if we +don't receive a response within a given period of time. +Policy decisions can usually tolerate short +periods of inconsistency. +The policy was decided some time ago, and is only now being put into +effect, so a few milliseconds of delay is normally inconsequential. + +

+However, there are algorithms that absolutely must see consistent data. +For example, the translation from a user-level SystemV semaphore +ID to the corresponding in-kernel data structure is protected by RCU, +but it is absolutely forbidden to update a semaphore that has just been +removed. +In the Linux kernel, this need for consistency is accommodated by acquiring +spinlocks located in the in-kernel data structure from within +the RCU read-side critical section, and this is indicated by the +green box in the figure above. +Many other techniques may be used, and are in fact used within the +Linux kernel. +

+In short, RCU is not required to maintain consistency, and other +mechanisms may be used in concert with RCU when consistency is required. +RCU's specialization allows it to do its job extremely well, and its +ability to interoperate with other synchronization mechanisms allows +the right mix of synchronization tools to be used for a given job. + +

Performance and Scalability

+ +

+Energy efficiency is a critical component of performance today, +and Linux-kernel RCU implementations must therefore avoid unnecessarily +awakening idle CPUs. +I cannot claim that this requirement was premeditated. +In fact, I learned of it during a telephone conversation in which I +was given “frank and open” feedback on the importance +of energy efficiency in battery-powered systems and on specific +energy-efficiency shortcomings of the Linux-kernel RCU implementation. +In my experience, the battery-powered embedded community will consider +any unnecessary wakeups to be extremely unfriendly acts. +So much so that mere Linux-kernel-mailing-list posts are +insufficient to vent their ire. + +

+Memory consumption is not particularly important in most +situations, and has become decreasingly +so as memory sizes have expanded and memory +costs have plummeted. +However, as I learned from Matt Mackall's +bloatwatch +efforts, memory footprint is critically important on single-CPU systems with +non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +tiny RCU +was born. +Josh Triplett has since taken over the small-memory banner with his +Linux kernel tinification +project, which resulted in +SRCU +becoming optional for those kernels not needing it. +

+The remaining performance requirements are, for the most part, +unsurprising. +For example, in keeping with RCU's read-side specialization, +rcu_dereference() should have negligible overhead (for +example, suppression of a few minor compiler optimizations). +Similarly, in non-preemptible environments, rcu_read_lock() and +rcu_read_unlock() should have exactly zero overhead. + +

+In preemptible environments, in the case where the RCU read-side +critical section was not preempted (as will be the case for the +highest-priority real-time process), rcu_read_lock() and +rcu_read_unlock() should have minimal overhead. +In particular, they should not contain atomic read-modify-write +operations, memory-barrier instructions, preemption disabling, +interrupt disabling, or backwards branches. +However, in the case where the RCU read-side critical section was preempted, +rcu_read_unlock() may acquire spinlocks and disable interrupts. +This is why it is better to nest an RCU read-side critical section +within a preempt-disable region than vice versa, at least in cases +where that critical section is short enough to avoid unduly degrading +real-time latencies. + +

+The synchronize_rcu() grace-period-wait primitive is +optimized for throughput. +It may therefore incur several milliseconds of latency in addition to +the duration of the longest RCU read-side critical section. +On the other hand, multiple concurrent invocations of +synchronize_rcu() are required to use batching optimizations +so that they can be satisfied by a single underlying grace-period-wait +operation. +For example, in the Linux kernel, it is not unusual for a single +grace-period-wait operation to serve more than +1,000 separate invocations +of synchronize_rcu(), thus amortizing the per-invocation +overhead down to nearly zero. +However, the grace-period optimization is also required to avoid +measurable degradation of real-time scheduling and interrupt latencies. + +

+In some cases, the multi-millisecond synchronize_rcu() +latencies are unacceptable. +In these cases, synchronize_rcu_expedited() may be used +instead, reducing the grace-period latency down to a few tens of +microseconds on small systems, at least in cases where the RCU read-side +critical sections are short. +There are currently no special latency requirements for +synchronize_rcu_expedited() on large systems, but, +consistent with the empirical nature of the RCU specification, +that is subject to change. +However, there most definitely are scalability requirements: +A storm of synchronize_rcu_expedited() invocations on 4096 +CPUs should at least make reasonable forward progress. +In return for its shorter latencies, synchronize_rcu_expedited() +is permitted to impose modest degradation of real-time latency +on non-idle online CPUs. +That said, it will likely be necessary to take further steps to reduce this +degradation, hopefully to roughly that of a scheduling-clock interrupt. + +

+There are a number of situations where even +synchronize_rcu_expedited()'s reduced grace-period +latency is unacceptable. +In these situations, the asynchronous call_rcu() can be +used in place of synchronize_rcu() as follows: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9   struct foo *p = container_of(rhp, struct foo, rh);
+10
+11   kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16   struct foo *p;
+17
+18   spin_lock(&gp_lock);
+19   p = rcu_access_pointer(gp);
+20   if (!p) {
+21     spin_unlock(&gp_lock);
+22     return false;
+23   }
+24   rcu_assign_pointer(gp, NULL);
+25   call_rcu(&p->rh, remove_gp_cb);
+26   spin_unlock(&gp_lock);
+27   return true;
+28 }
+
+
+ +

+A definition of struct foo is finally needed, and appears +on lines 1-5. +The function remove_gp_cb() is passed to call_rcu() +on line 25, and will be invoked after the end of a subsequent +grace period. +This gets the same effect as remove_gp_synchronous(), +but without forcing the updater to wait for a grace period to elapse. +The call_rcu() function may be used in a number of +situations where neither synchronize_rcu() nor +synchronize_rcu_expedited() would be legal, +including within preempt-disable code, local_bh_disable() code, +interrupt-disable code, and interrupt handlers. +However, even call_rcu() is illegal within NMI handlers. +The callback function (remove_gp_cb() in this case) will be +executed within a softirq (software interrupt) environment within the +Linux kernel, +either within a real softirq handler or under the protection +of local_bh_disable(). +In both the Linux kernel and in userspace, it is bad practice to +write an RCU callback function that takes too long. +Long-running operations should be relegated to separate threads or +(in the Linux kernel) workqueues. +

@@QQ@@ +Why does line 19 use rcu_access_pointer()? +After all, call_rcu() on line 25 stores into the +structure, which would interact badly with concurrent insertions. +Doesn't this mean that rcu_dereference() is required? +

@@QQA@@ +Presumably the gp_lock acquired on line 18 excludes +any changes, including any insertions that rcu_dereference() +would protect against. +Therefore, any insertions will be delayed until after gp_lock +is released on line 26, which in turn means that +rcu_access_pointer() suffices. +

@@QQE@@ + +

+However, all that remove_gp_cb() is doing is +invoking kfree() on the data element. +This is a common idiom, and is supported by kfree_rcu(), +which allows “fire and forget” operation as shown below: + +

+
+ 1 struct foo {
+ 2   int a;
+ 3   int b;
+ 4   struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9   struct foo *p;
+10
+11   spin_lock(&gp_lock);
+12   p = rcu_dereference(gp);
+13   if (!p) {
+14     spin_unlock(&gp_lock);
+15     return false;
+16   }
+17   rcu_assign_pointer(gp, NULL);
+18   kfree_rcu(p, rh);
+19   spin_unlock(&gp_lock);
+20   return true;
+21 }
+
+
+ +

+Note that remove_gp_faf() simply invokes +kfree_rcu() and proceeds, without any need to pay any +further attention to the subsequent grace period and kfree(). +It is permissible to invoke kfree_rcu() from the same +environments as for call_rcu(). +Interestingly enough, DYNIX/ptx had the equivalents of +call_rcu() and kfree_rcu(), but not +synchronize_rcu(). +This was due to the fact that RCU was not heavily used within DYNIX/ptx, +so the very few places that needed something like +synchronize_rcu() simply open-coded it. + +

@@QQ@@ +Earlier it was claimed that call_rcu() and +kfree_rcu() allowed updaters to avoid being blocked +by readers. +But how can that be correct, given that the invocation of the callback +and the freeing of the memory (respectively) must still wait for +a grace period to elapse? +

@@QQA@@ +We could define things this way, but keep in mind that this sort of +definition would say that updates in garbage-collected languages +cannot complete until the next time the garbage collector runs, +which does not seem at all reasonable. +The key point is that in most cases, an updater using either +call_rcu() or kfree_rcu() can proceed to the +next update as soon as it has invoked call_rcu() or +kfree_rcu(), without having to wait for a subsequent +grace period. +

@@QQE@@ + +

+But what if the updater must wait for the completion of code to be +executed after the end of the grace period, but has other tasks +that can be carried out in the meantime? +The polling-style get_state_synchronize_rcu() and +cond_synchronize_rcu() functions may be used for this +purpose, as shown below: + +

+
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3   struct foo *p;
+ 4   unsigned long s;
+ 5
+ 6   spin_lock(&gp_lock);
+ 7   p = rcu_access_pointer(gp);
+ 8   if (!p) {
+ 9     spin_unlock(&gp_lock);
+10     return false;
+11   }
+12   rcu_assign_pointer(gp, NULL);
+13   spin_unlock(&gp_lock);
+14   s = get_state_synchronize_rcu();
+15   do_something_while_waiting();
+16   cond_synchronize_rcu(s);
+17   kfree(p);
+18   return true;
+19 }
+
+
+ +

+On line 14, get_state_synchronize_rcu() obtains a +“cookie” from RCU, +then line 15 carries out other tasks, +and finally, line 16 returns immediately if a grace period has +elapsed in the meantime, but otherwise waits as required. +The need for get_state_synchronize_rcu() and +cond_synchronize_rcu() has appeared quite recently, +so it is too early to tell whether they will stand the test of time. +

+RCU thus provides a range of tools to allow updaters to strike the +required tradeoff between latency, flexibility, and CPU overhead. +

Composability

+ +

+Composability has received much attention in recent years, perhaps in part +due to the collision of multicore hardware with object-oriented techniques +designed in single-threaded environments for single-threaded use. +And in theory, RCU read-side critical sections may be composed, and in +fact may be nested arbitrarily deeply. +In practice, as with all real-world implementations of composable +constructs, there are limitations. + +

+Implementations of RCU for which rcu_read_lock() +and rcu_read_unlock() generate no code, such as +Linux-kernel RCU when CONFIG_PREEMPT=n, can be +nested arbitrarily deeply. +After all, there is no overhead. +Except that if all these instances of rcu_read_lock() +and rcu_read_unlock() are visible to the compiler, +compilation will eventually fail due to exhausting memory, +mass storage, or user patience, whichever comes first. +If the nesting is not visible to the compiler, as is the case with +mutually recursive functions each in its own translation unit, +stack overflow will result. +If the nesting takes the form of loops, either the control variable +will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. +Nevertheless, this class of RCU implementations is one +of the most composable constructs in existence. + +

+RCU implementations that explicitly track nesting depth +are limited by the nesting-depth counter. +For example, the Linux kernel's preemptible RCU limits nesting to +INT_MAX. +This should suffice for almost all practical purposes. +That said, a consecutive pair of RCU read-side critical sections +between which there is an operation that waits for a grace period +cannot be enclosed in another RCU read-side critical section. +This is because it is not legal to wait for a grace period within +an RCU read-side critical section: To do so would result either +in deadlock or +in RCU implicitly splitting the enclosing RCU read-side critical +section, neither of which is conducive to a long-lived and prosperous +kernel. + +
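+
+The following (hypothetical) fragment sketches the illegal pattern
+just described, in which an operation that waits for a grace period
+is enclosed within another RCU read-side critical section:
+
+ 1 rcu_read_lock();    /* Enclosing critical section. */
+ 2 rcu_read_lock();
+ 3 do_something_1();
+ 4 rcu_read_unlock();
+ 5 synchronize_rcu();  /* BUG: deadlock or implicit splitting! */
+ 6 rcu_read_lock();
+ 7 do_something_2();
+ 8 rcu_read_unlock();
+ 9 rcu_read_unlock();  /* End of enclosing critical section. */
+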

+In short, although RCU read-side critical sections are highly composable, +care is required in some situations, just as is the case for any other +composable synchronization mechanism. + +

Corner Cases

+ +

+A given RCU workload might have an endless and intense stream of +RCU read-side critical sections, perhaps even so intense that there +was never a point in time during which there was not at least one +RCU read-side critical section in flight. +RCU cannot allow this situation to block grace periods: As long as +all the RCU read-side critical sections are finite, grace periods +must also be finite. + +

+That said, preemptible RCU implementations could potentially result +in RCU read-side critical sections being preempted for long durations, +which has the effect of creating a long-duration RCU read-side +critical section. +This situation can arise only in heavily loaded systems, but systems using +real-time priorities are of course more vulnerable. +Therefore, RCU priority boosting is provided to help deal with this +case. +That said, the exact requirements on RCU priority boosting will likely +evolve as more experience accumulates. + +

+Other workloads might have very high update rates. +Although one can argue that such workloads should instead use +something other than RCU, the fact remains that RCU must +handle such workloads gracefully. +This requirement is another factor driving batching of grace periods, +but it is also the driving force behind the checks for large numbers +of queued RCU callbacks in the call_rcu() code path. +Finally, high update rates should not delay RCU read-side critical +sections, although some read-side delays can occur when using +synchronize_rcu_expedited(), courtesy of this function's use +of try_stop_cpus(). +(In the future, synchronize_rcu_expedited() will be +converted to use lighter-weight inter-processor interrupts (IPIs), +but this will still disturb readers, though to a much smaller degree.) + +

+Although all three of these corner cases were understood in the early +1990s, a simple user-level test consisting of close(open(path)) +in a tight loop +in the early 2000s suddenly provided a much deeper appreciation of the +high-update-rate corner case. +This test also motivated addition of some RCU code to react to high update +rates, for example, if a given CPU finds itself with more than 10,000 +RCU callbacks queued, it will cause RCU to take evasive action by +more aggressively starting grace periods and more aggressively forcing +completion of grace-period processing. +This evasive action causes the grace period to complete more quickly, +but at the cost of restricting RCU's batching optimizations, thus +increasing the CPU overhead incurred by that grace period. + +

+Software-Engineering Requirements

+ +

+Between Murphy's Law and “To err is human”, it is necessary to +guard against mishaps and misuse: + +

    +
  1. It is all too easy to forget to use rcu_read_lock() + everywhere that it is needed, so kernels built with + CONFIG_PROVE_RCU=y will splat if + rcu_dereference() is used outside of an + RCU read-side critical section. + Update-side code can use rcu_dereference_protected(), + which takes a + lockdep expression + to indicate what is providing the protection + (see the sketch following this list). + If the indicated protection is not provided, a lockdep splat + is emitted. + +

    + Code shared between readers and updaters can use + rcu_dereference_check(), which also takes a + lockdep expression, and emits a lockdep splat if neither + rcu_read_lock() nor the indicated protection + is in place. + In addition, rcu_dereference_raw() is used in those + (hopefully rare) cases where the required protection cannot + be easily described. + Finally, rcu_read_lock_held() is provided to + allow a function to verify that it has been invoked within + an RCU read-side critical section. + I was made aware of this set of requirements shortly after Thomas + Gleixner audited a number of RCU uses. +

  2. A given function might wish to check for RCU-related preconditions + upon entry, before using any other RCU API. + The rcu_lockdep_assert() does this job, + asserting the expression in kernels having lockdep enabled + and doing nothing otherwise. +
  3. It is also easy to forget to use rcu_assign_pointer() + and rcu_dereference(), perhaps (incorrectly) + substituting a simple assignment. + To catch this sort of error, a given RCU-protected pointer may be + tagged with __rcu, after which running sparse + with CONFIG_SPARSE_RCU_POINTER=y will complain + about simple-assignment accesses to that pointer. + Arnd Bergmann made me aware of this requirement, and also + supplied the needed + patch series. +
  4. Kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y + will splat if a data element is passed to call_rcu() + twice in a row, without a grace period in between. + (This error is similar to a double free.) + The corresponding rcu_head structures that are + dynamically allocated are automatically tracked, but + rcu_head structures allocated on the stack + must be initialized with init_rcu_head_on_stack() + and cleaned up with destroy_rcu_head_on_stack(). + Similarly, statically allocated non-stack rcu_head + structures must be initialized with init_rcu_head() + and cleaned up with destroy_rcu_head(). + Mathieu Desnoyers made me aware of this requirement, and also + supplied the needed + patch. +
  5. An infinite loop in an RCU read-side critical section will + eventually trigger an RCU CPU stall warning splat. + However, RCU is not obligated to produce this splat + unless there is a grace period waiting on that particular + RCU read-side critical section. + This requirement made itself known in the early 1990s, pretty + much the first time that it was necessary to debug a CPU stall. +
  6. Although it would be very good to detect pointers leaking out + of RCU read-side critical sections, there is currently no + good way of doing this. + One complication is the need to distinguish between pointers + leaking and pointers that have been handed off from RCU to + some other synchronization mechanism, for example, reference + counting. +
  7. In kernels built with CONFIG_RCU_TRACE=y, RCU-related + information is provided via both debugfs and event tracing. +
  8. Open-coded use of rcu_assign_pointer() and + rcu_dereference() to create typical linked + data structures can be surprisingly error-prone. + Therefore, RCU-protected + linked lists + and, more recently, RCU-protected + hash tables + are available. + Many other special-purpose RCU-protected data structures are + available in the Linux kernel and the userspace RCU library. +
  9. Some linked structures are created at compile time, but still + require __rcu checking. + The RCU_POINTER_INITIALIZER() macro serves this + purpose. +
  10. It is not necessary to use rcu_assign_pointer() + when creating linked structures that are to be published via + a single external pointer. + The RCU_INIT_POINTER() macro is provided for + this task and also for assigning NULL pointers + at runtime. +
+ +

+This is not a hard-and-fast list: RCU's diagnostic capabilities will +continue to be guided by the number and type of usage bugs found +in real-world RCU usage. + +
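For concreteness, here is a minimal sketch showing how several of the diagnostic APIs in the above list fit together. The struct foo, foo_lock, and gp names are hypothetical, chosen purely for illustration; only the RCU APIs themselves are taken from the list above.

    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct foo {
        int a;
        struct rcu_head rh;
    };

    static DEFINE_SPINLOCK(foo_lock);
    static struct foo __rcu *gp;    /* __rcu tag is checked by sparse. */

    /* Reader: rcu_dereference() splats if no protection is in place. */
    static int foo_get_a(void)
    {
        struct foo *p;
        int ret = -1;

        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p)
            ret = p->a;
        rcu_read_unlock();
        return ret;
    }

    /* Updater: here foo_lock is what provides the protection. */
    static void foo_update_a(struct foo *newp)
    {
        struct foo *oldp;

        spin_lock(&foo_lock);
        oldp = rcu_dereference_protected(gp, lockdep_is_held(&foo_lock));
        rcu_assign_pointer(gp, newp);
        spin_unlock(&foo_lock);
        if (oldp)
            kfree_rcu(oldp, rh); /* Double-posting would trigger DEBUG_OBJECTS_RCU_HEAD. */
    }

Running sparse with CONFIG_SPARSE_RCU_POINTER=y would then flag any plain assignment to or from gp.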

Linux Kernel Complications

+ +

+The Linux kernel provides an interesting environment for all kinds of +software, including RCU. +Some of the relevant points of interest are as follows: + +

    +
  1. Configuration. +
  2. Firmware Interface. +
  3. Early Boot. +
  4. + Interrupts and non-maskable interrupts (NMIs). +
  5. Loadable Modules. +
  6. Hotplug CPU. +
  7. Scheduler and RCU. +
  8. Tracing and RCU. +
  9. Energy Efficiency. +
  10. + Performance, Scalability, Response Time, and Reliability. +
+ +

+This list is probably incomplete, but it does give a feel for the +most notable Linux-kernel complications. +Each of the following sections covers one of the above topics. + +

Configuration

+ +

+RCU's goal is automatic configuration, so that almost nobody +needs to worry about RCU's Kconfig options. +And for almost all users, RCU does in fact work well +“out of the box.” + +

+However, there are specialized use cases that are handled by +kernel boot parameters and Kconfig options. +Unfortunately, the Kconfig system will explicitly ask users +about new Kconfig options, which requires that almost all of them +be hidden behind a CONFIG_RCU_EXPERT Kconfig option. + +

+This all should be quite obvious, but the fact remains that +Linus Torvalds recently had to +remind +me of this requirement. + +

Firmware Interface

+ +

+In many cases, the kernel obtains information about the system from the +firmware, and sometimes things are lost in translation. +Or the translation is accurate, but the original message is bogus. + +

+For example, some systems' firmware overreports the number of CPUs, +sometimes by a large factor. +If RCU naively believed the firmware, as it used to do, +it would create too many per-CPU kthreads. +Although the resulting system will still run correctly, the extra +kthreads needlessly consume memory and can cause confusion +when they show up in ps listings. + +

+RCU must therefore wait for a given CPU to actually come online before +it can allow itself to believe that the CPU actually exists. +The resulting “ghost CPUs” (which are never going to +come online) cause a number of +interesting complications. + +

Early Boot

+ +

+The Linux kernel's boot sequence is an interesting process, +and RCU is used early, even before rcu_init() +is invoked. +In fact, a number of RCU's primitives can be used as soon as the +initial task's task_struct is available and the +boot CPU's per-CPU variables are set up. +The read-side primitives (rcu_read_lock(), +rcu_read_unlock(), rcu_dereference(), +and rcu_access_pointer()) will operate normally very early on, +as will rcu_assign_pointer(). + +

+Although call_rcu() may be invoked at any +time during boot, callbacks are not guaranteed to be invoked until after +the scheduler is fully up and running. +This delay in callback invocation is due to the fact that RCU does not +invoke callbacks until it is fully initialized, and this full initialization +cannot occur until after the scheduler has initialized itself to the +point where RCU can spawn and run its kthreads. +In theory, it would be possible to invoke callbacks earlier; +however, this is not a panacea because there would be severe restrictions +on what operations those callbacks could invoke. + +

+Perhaps surprisingly, synchronize_rcu(), +synchronize_rcu_bh() +(discussed below), +and +synchronize_sched() +will all operate normally +during very early boot, the reason being that there is only one CPU +and preemption is disabled. +This means that a call to synchronize_rcu() (or friends) +is itself a quiescent +state and thus a grace period, so the early-boot implementation can +be a no-op. + +

+Both synchronize_rcu_bh() and synchronize_sched() +continue to operate normally through the remainder of boot, courtesy +of the fact that preemption is disabled across their RCU read-side +critical sections and also courtesy of the fact that there is still +only one CPU. +However, once the scheduler starts initializing, preemption is enabled. +There is still only a single CPU, but the fact that preemption is enabled +means that the no-op implementation of synchronize_rcu() no +longer works in CONFIG_PREEMPT=y kernels. +Therefore, as soon as the scheduler starts initializing, the early-boot +fastpath is disabled. +This means that synchronize_rcu() switches to its runtime +mode of operation where it posts callbacks, which in turn means that +any call to synchronize_rcu() will block until the corresponding +callback is invoked. +Unfortunately, the callback cannot be invoked until RCU's runtime +grace-period machinery is up and running, which cannot happen until +the scheduler has initialized itself sufficiently to allow RCU's +kthreads to be spawned. +Therefore, invoking synchronize_rcu() during scheduler +initialization can result in deadlock. + +

@@QQ@@ +So what happens with synchronize_rcu() during +scheduler initialization for CONFIG_PREEMPT=n +kernels? +

@@QQA@@ +In CONFIG_PREEMPT=n kernels, synchronize_rcu() +maps directly to synchronize_sched(). +Therefore, synchronize_rcu() works normally throughout +boot in CONFIG_PREEMPT=n kernels. +However, your code must also work in CONFIG_PREEMPT=y kernels, +so it is still necessary to avoid invoking synchronize_rcu() +during scheduler initialization. +

@@QQE@@ + +

+I learned of these boot-time requirements as a result of a series of +system hangs. + +

Interrupts and NMIs

+ +

+The Linux kernel has interrupts, and RCU read-side critical sections are +legal within interrupt handlers and within interrupt-disabled regions +of code, as are invocations of call_rcu(). + +
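As a hedged illustration of read-side use from interrupt context (the handler and helper names are hypothetical, and gp is the illustrative pointer from the earlier diagnostics sketch):

    #include <linux/interrupt.h>

    static irqreturn_t foo_irq_handler(int irq, void *dev_id)
    {
        struct foo *p;

        rcu_read_lock();                /* Legal in hardirq context. */
        p = rcu_dereference(gp);
        if (p)
            foo_handle_event(p);        /* Hypothetical helper. */
        rcu_read_unlock();
        return IRQ_HANDLED;
    }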

+Some Linux-kernel architectures can enter an interrupt handler from +non-idle process context, and then just never leave it, instead stealthily +transitioning back to process context. +This trick is sometimes used to invoke system calls from inside the kernel. +These “half-interrupts” mean that RCU has to be very careful +about how it counts interrupt nesting levels. +I learned of this requirement the hard way during a rewrite +of RCU's dyntick-idle code. + +

+The Linux kernel has non-maskable interrupts (NMIs), and +RCU read-side critical sections are legal within NMI handlers. +Thankfully, RCU update-side primitives, including +call_rcu(), are prohibited within NMI handlers. + +

+The name notwithstanding, some Linux-kernel architectures +can have nested NMIs, which RCU must handle correctly. +Andy Lutomirski +surprised me +with this requirement; +he also kindly surprised me with +an algorithm +that meets this requirement. + +

Loadable Modules

+ +

+The Linux kernel has loadable modules, and these modules can +also be unloaded. +After a given module has been unloaded, any attempt to call +one of its functions results in a segmentation fault. +The module-unload functions must therefore cancel any +delayed calls to loadable-module functions, for example, +any outstanding mod_timer() must be dealt with +via del_timer_sync() or similar. + +

+Unfortunately, there is no way to cancel an RCU callback; +once you invoke call_rcu(), the callback function is +going to eventually be invoked, unless the system goes down first. +Because it is normally considered socially irresponsible to crash the system +in response to a module unload request, we need some other way +to deal with in-flight RCU callbacks. + +

+RCU therefore provides +rcu_barrier(), +which waits until all in-flight RCU callbacks have been invoked. +If a module uses call_rcu(), its exit function should therefore +prevent any future invocation of call_rcu(), then invoke +rcu_barrier(). +In theory, the underlying module-unload code could invoke +rcu_barrier() unconditionally, but in practice this would +incur unacceptable latencies. + +
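The resulting module-exit idiom might look as follows; foo_exit() and foo_disable_new_callbacks() are hypothetical names standing in for whatever mechanism a given module uses to stop posting new callbacks:

    #include <linux/module.h>
    #include <linux/rcupdate.h>

    static void __exit foo_exit(void)
    {
        /* Step 1: prevent any new call_rcu() invocations. */
        foo_disable_new_callbacks();    /* Hypothetical. */

        /* Step 2: wait for all in-flight callbacks to be invoked. */
        rcu_barrier();

        /* Only now is it safe for this module's text and data to vanish. */
    }
    module_exit(foo_exit);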

+Nikita Danilov noted this requirement for an analogous filesystem-unmount +situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. +The need for rcu_barrier() for module unloading became +apparent later. + +

Hotplug CPU

+ +

+The Linux kernel supports CPU hotplug, which means that CPUs +can come and go. +It is of course illegal to use any RCU API member from an offline CPU. +This requirement was present from day one in DYNIX/ptx, but +on the other hand, the Linux kernel's CPU-hotplug implementation +is “interesting.” + +

+The Linux-kernel CPU-hotplug implementation has notifiers that +are used to allow the various kernel subsystems (including RCU) +to respond appropriately to a given CPU-hotplug operation. +Most RCU operations may be invoked from CPU-hotplug notifiers, +including even normal synchronous grace-period operations +such as synchronize_rcu(). +However, expedited grace-period operations such as +synchronize_rcu_expedited() are not supported, +due to the fact that current implementations block CPU-hotplug +operations, which could result in deadlock. + +

+In addition, all-callback-wait operations such as +rcu_barrier() are also not supported, due to the +fact that there are phases of CPU-hotplug operations where +the outgoing CPU's callbacks will not be invoked until after +the CPU-hotplug operation ends, which could also result in deadlock. + +

Scheduler and RCU

+ +

+RCU depends on the scheduler, and the scheduler uses RCU to +protect some of its data structures. +This means the scheduler is forbidden from acquiring +the runqueue locks and the priority-inheritance locks +in the middle of an outermost RCU read-side critical section unless +it also releases them before exiting that same +RCU read-side critical section. +This same prohibition also applies to any lock that is acquired +while holding any lock to which this prohibition applies. +Violating this rule results in deadlock. + +

+For RCU's part, the preemptible-RCU rcu_read_unlock() +implementation must be written carefully to avoid similar deadlocks. +In particular, rcu_read_unlock() must tolerate an +interrupt where the interrupt handler invokes both +rcu_read_lock() and rcu_read_unlock(). +This possibility requires rcu_read_unlock() to use +negative nesting levels to avoid destructive recursion via +the interrupt handler's use of RCU. + +

+This pair of mutual scheduler-RCU requirements came as a +complete surprise. + +

+As noted above, RCU makes use of kthreads, and it is necessary to +avoid excessive CPU-time accumulation by these kthreads. +This requirement was no surprise, but RCU's violation of it +when running context-switch-heavy workloads on kernels built with +CONFIG_NO_HZ_FULL=y +did come as a surprise [PDF]. +RCU has made good progress towards meeting this requirement, even +for context-switch-heavy CONFIG_NO_HZ_FULL=y workloads, +but there is room for further improvement. + +

Tracing and RCU

+ +

+It is possible to use tracing on RCU code, but tracing itself +uses RCU. +For this reason, rcu_dereference_raw_notrace() +is provided for use by tracing, which avoids the destructive +recursion that could otherwise ensue. +This API is also used by virtualization in some architectures, +where RCU readers execute in environments in which tracing +cannot be used. +The tracing folks both located the requirement and provided the +needed fix, so this surprise requirement was relatively painless. + +

Energy Efficiency

+ +

+Interrupting idle CPUs is considered socially unacceptable, +especially by people with battery-powered embedded systems. +RCU therefore conserves energy by detecting which CPUs are +idle, including tracking CPUs that have been interrupted from idle. +This is a large part of the energy-efficiency requirement, +which I learned of via an irate phone call. + +

+Because RCU avoids interrupting idle CPUs, it is illegal to +execute an RCU read-side critical section on an idle CPU. +(Kernels built with CONFIG_PROVE_RCU=y will splat +if you try it.) +The RCU_NONIDLE() macro and _rcuidle +event tracing is provided to work around this restriction. +In addition, rcu_is_watching() may be used to +test whether or not it is currently legal to run RCU read-side +critical sections on this CPU. +I learned of the need for diagnostics on the one hand +and RCU_NONIDLE() on the other while inspecting +idle-loop code. +Steven Rostedt supplied _rcuidle event tracing, +which is used quite heavily in the idle loop. + +
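A minimal sketch, assuming a hypothetical foo_idle_hook() called from the idle loop and a hypothetical RCU-using helper:

    static void foo_idle_hook(void)
    {
        /* RCU is not watching the idle loop, so RCU readers are illegal... */
        WARN_ON_ONCE(rcu_is_watching());

        /* ...unless wrapped so that RCU momentarily watches again. */
        RCU_NONIDLE(foo_do_something_with_rcu());   /* Hypothetical. */
    }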

+It is similarly socially unacceptable to interrupt an +nohz_full CPU running in userspace. +RCU must therefore track nohz_full userspace +execution. +And in +CONFIG_NO_HZ_FULL_SYSIDLE=y +kernels, RCU must separately track idle CPUs on the one hand and +CPUs that are either idle or executing in userspace on the other. +In both cases, RCU must be able to sample state at two points in +time, and be able to determine whether or not some other CPU spent +any time idle and/or executing in userspace. + +

+These energy-efficiency requirements have proven quite difficult to +understand and to meet. +For example, there have been more than five +clean-sheet rewrites of RCU's energy-efficiency code, the last of +which was finally able to demonstrate +real energy savings running on real hardware [PDF]. +As noted earlier, +I learned of many of these requirements via angry phone calls: +Flaming me on the Linux-kernel mailing list was apparently not +sufficient to fully vent the callers' ire at RCU's energy-efficiency bugs! + +

+Performance, Scalability, Response Time, and Reliability

+ +

+Expanding on the +earlier discussion, +RCU is used heavily by hot code paths in performance-critical +portions of the Linux kernel's networking, security, virtualization, +and scheduling code. +RCU must therefore use efficient implementations, especially in its +read-side primitives. +To that end, it would be good if preemptible RCU's implementation +of rcu_read_lock() could be inlined; however, doing +this requires resolving #include issues with the +task_struct structure. + +

+The Linux kernel supports hardware configurations with up to +4096 CPUs, which means that RCU must be extremely scalable. +Algorithms that involve frequent acquisitions of global locks or +frequent atomic operations on global variables simply cannot be +tolerated within the RCU implementation. +RCU therefore makes heavy use of a combining tree based on the +rcu_node structure. +RCU is required to tolerate all CPUs continuously invoking any +combination of RCU's runtime primitives with minimal per-operation +overhead; witness the batching optimizations for +synchronize_rcu(), call_rcu(), +synchronize_rcu_expedited(), and rcu_barrier(), +which in many cases mean that increasing load actually decreases the +per-operation overhead. +As a general rule, RCU must cheerfully accept whatever the +rest of the Linux kernel decides to throw at it. + +

+The Linux kernel is used for real-time workloads, especially +in conjunction with the +-rt patchset. +The real-time-latency response requirements are such that the +traditional approach of disabling preemption across RCU +read-side critical sections is inappropriate. +Kernels built with CONFIG_PREEMPT=y therefore +use an RCU implementation that allows RCU read-side critical +sections to be preempted. +This requirement made its presence known after users made it +clear that an earlier +real-time patch +did not meet their needs, in conjunction with some +RCU issues +encountered by a very early version of the -rt patchset. + +

+In addition, RCU must make do with a sub-100-microsecond real-time latency +budget. +In fact, on smaller systems with the -rt patchset, the Linux kernel +provides sub-20-microsecond real-time latencies for the whole kernel, +including RCU. +RCU's scalability and latency must therefore be sufficient for +these sorts of configurations. +To my surprise, the sub-100-microsecond real-time latency budget + +applies to even the largest systems [PDF], +up to and including systems with 4096 CPUs. +This real-time requirement motivated the grace-period kthread, which +also simplified handling of a number of race conditions. + +

+Finally, RCU's status as a synchronization primitive means that +any RCU failure can result in arbitrary memory corruption that can be +extremely difficult to debug. +This means that RCU must be extremely reliable, which in +practice also means that RCU must have an aggressive stress-test +suite. +This stress-test suite is called rcutorture. + +

+Although the need for rcutorture was no surprise, +the current immense popularity of the Linux kernel is posing +interesting—and perhaps unprecedented—validation +challenges. +To see this, keep in mind that there are well over one billion +instances of the Linux kernel running today, given Android +smartphones, Linux-powered televisions, and servers. +This number can be expected to increase sharply with the advent of +the celebrated Internet of Things. + +

+Suppose that RCU contains a race condition that manifests on average +once per million years of runtime. +This bug will be occurring about three times per day across +the installed base. +RCU could simply hide behind hardware error rates, given that no one +should really expect their smartphone to last for a million years. +However, anyone taking too much comfort from this thought should +consider the fact that in most jurisdictions, a successful multi-year +test of a given mechanism, which might include a Linux kernel, +suffices for a number of types of safety-critical certifications. +In fact, rumor has it that the Linux kernel is already being used +in production for safety-critical applications. +I don't know about you, but I would feel quite bad if a bug in RCU +killed someone. +Which might explain my recent focus on validation and verification. + +
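For the record, the back-of-the-envelope arithmetic behind the three-times-per-day figure, using the round numbers above, is:

    \[
    \frac{10^{9}\ \text{instances}}{10^{6}\ \text{instance-years per failure}}
      = 10^{3}\ \text{failures per year}
      \approx 2.7\ \text{failures per day}
    \]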

Other RCU Flavors

+ +

+One of the more surprising things about RCU is that there are now +no fewer than five flavors, or API families. +In addition, the primary flavor that has been the sole focus up to +this point has two different implementations, non-preemptible and +preemptible. +The other four flavors are listed below, with requirements for each +described in a separate section. + +

    +
  1. Bottom-Half Flavor +
  2. Sched Flavor +
  3. Sleepable RCU +
  4. Tasks RCU +
+ +

Bottom-Half Flavor

+ +

+The softirq-disable (AKA “bottom-half”, +hence the “_bh” abbreviations) +flavor of RCU, or RCU-bh, was developed by +Dipankar Sarma to provide a flavor of RCU that could withstand the +network-based denial-of-service attacks researched by Robert +Olsson. +These attacks placed so much networking load on the system +that some of the CPUs never exited softirq execution, +which in turn prevented those CPUs from ever executing a context switch, +which, in the RCU implementation of that time, prevented grace periods +from ever ending. +The result was an out-of-memory condition and a system hang. + +

+The solution was the creation of RCU-bh, which does +local_bh_disable() +across its read-side critical sections, and which uses the transition +from one type of softirq processing to another as a quiescent state +in addition to context switch, idle, user mode, and offline. +This means that RCU-bh grace periods can complete even when some of +the CPUs execute in softirq indefinitely, thus allowing algorithms +based on RCU-bh to withstand network-based denial-of-service attacks. + +

+Because +rcu_read_lock_bh() and rcu_read_unlock_bh() +disable and re-enable softirq handlers, any attempt to start a softirq +handler during the +RCU-bh read-side critical section will be deferred. +In this case, rcu_read_unlock_bh() +will invoke softirq processing, which can take considerable time. +One can of course argue that this softirq overhead should be associated +with the code following the RCU-bh read-side critical section rather +than rcu_read_unlock_bh(), but the fact +is that most profiling tools cannot be expected to make this sort +of fine distinction. +For example, suppose that a three-millisecond-long RCU-bh read-side +critical section executes during a time of heavy networking load. +There will very likely be an attempt to invoke at least one softirq +handler during that three milliseconds, but any such invocation will +be delayed until the time of the rcu_read_unlock_bh(). +This can of course make it appear at first glance as if +rcu_read_unlock_bh() was executing very slowly. + +

+The +RCU-bh API +includes +rcu_read_lock_bh(), +rcu_read_unlock_bh(), +rcu_dereference_bh(), +rcu_dereference_bh_check(), +synchronize_rcu_bh(), +synchronize_rcu_bh_expedited(), +call_rcu_bh(), +rcu_barrier_bh(), and +rcu_read_lock_bh_held(). + +
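A usage sketch, reusing the hypothetical struct foo, gp, and foo_lock declarations from the earlier diagnostics example:

    /* Reader: softirq handlers cannot run while this executes. */
    static int foo_bh_get_a(void)
    {
        struct foo *p;
        int ret = -1;

        rcu_read_lock_bh();
        p = rcu_dereference_bh(gp);
        if (p)
            ret = p->a;
        rcu_read_unlock_bh();   /* May itself run any deferred softirqs. */
        return ret;
    }

An updater publishes with rcu_assign_pointer() as usual and then invokes synchronize_rcu_bh() or call_rcu_bh() before freeing the old element.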

Sched Flavor

+ +

+Before preemptible RCU, waiting for an RCU grace period had the +side effect of also waiting for all pre-existing interrupt +and NMI handlers. +However, there are legitimate preemptible-RCU implementations that +do not have this property, given that any point in the code outside +of an RCU read-side critical section can be a quiescent state. +Therefore, RCU-sched was created, which follows “classic” +RCU in that an RCU-sched grace period waits for pre-existing +interrupt and NMI handlers. +In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched +APIs have identical implementations, while kernels built with +CONFIG_PREEMPT=y provide a separate implementation for each. + +

+Note well that in CONFIG_PREEMPT=y kernels, +rcu_read_lock_sched() and rcu_read_unlock_sched() +disable and re-enable preemption, respectively. +This means that if there was a preemption attempt during the +RCU-sched read-side critical section, rcu_read_unlock_sched() +will enter the scheduler, with all the latency and overhead entailed. +Just as with rcu_read_unlock_bh(), this can make it look +as if rcu_read_unlock_sched() was executing very slowly. +However, the highest-priority task won't be preempted, so that task +will enjoy low-overhead rcu_read_unlock_sched() invocations. + +

+The +RCU-sched API +includes +rcu_read_lock_sched(), +rcu_read_unlock_sched(), +rcu_read_lock_sched_notrace(), +rcu_read_unlock_sched_notrace(), +rcu_dereference_sched(), +rcu_dereference_sched_check(), +synchronize_sched(), +synchronize_rcu_sched_expedited(), +call_rcu_sched(), +rcu_barrier_sched(), and +rcu_read_lock_sched_held(). +However, anything that disables preemption also marks an RCU-sched +read-side critical section, including +preempt_disable() and preempt_enable(), +local_irq_save() and local_irq_restore(), +and so on. + +
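Again reusing the hypothetical declarations from the earlier sketches, an RCU-sched reader might look like this:

    /* rcu_read_lock_sched() is equivalent to disabling preemption. */
    static int foo_sched_get_a(void)
    {
        struct foo *p;
        int ret = -1;

        rcu_read_lock_sched();
        p = rcu_dereference_sched(gp);
        if (p)
            ret = p->a;
        rcu_read_unlock_sched(); /* May enter the scheduler if preemption was attempted. */
        return ret;
    }

On the update side, synchronize_sched() or call_rcu_sched() then waits for all such readers, along with pre-existing interrupt and NMI handlers.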

Sleepable RCU

+ +

+For well over a decade, someone saying “I need to block within +an RCU read-side critical section” was a reliable indication +that this someone did not understand RCU. +After all, if you are always blocking in an RCU read-side critical +section, you can probably afford to use a higher-overhead synchronization +mechanism. +However, that changed with the advent of the Linux kernel's notifiers, +whose RCU read-side critical +sections almost never sleep, but sometimes need to. +This resulted in the introduction of +sleepable RCU, +or SRCU. + +

+SRCU allows different domains to be defined, with each such domain +defined by an instance of an srcu_struct structure. +A pointer to this structure must be passed in to each SRCU function, +for example, synchronize_srcu(&ss), where +ss is the srcu_struct structure. +The key benefit of these domains is that a slow SRCU reader in one +domain does not delay an SRCU grace period in some other domain. +That said, one consequence of these domains is that read-side code +must pass a “cookie” from srcu_read_lock() +to srcu_read_unlock(), for example, as follows: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+
+
+ +

+As noted above, it is legal to block within SRCU read-side critical sections; +however, with great power comes great responsibility. +If you block forever in one of a given domain's SRCU read-side critical +sections, then that domain's grace periods will also be blocked forever. +Of course, one good way to block forever is to deadlock, which can +happen if any operation in a given domain's SRCU read-side critical +section can block waiting, either directly or indirectly, for that domain's +grace period to elapse. +For example, this results in a self-deadlock: + +

+
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+
+
+ +

+However, if line 5 acquired a mutex that was held across +a synchronize_srcu() for domain ss, +deadlock would still be possible. +Furthermore, if line 5 acquired a mutex that was held across +a synchronize_srcu() for some other domain ss1, +and if an ss1-domain SRCU read-side critical section +acquired another mutex that was held across an ss-domain +synchronize_srcu(), +deadlock would again be possible. +Such a deadlock cycle could extend across an arbitrarily large number +of different SRCU domains. +Again, with great power comes great responsibility. + +

+Unlike the other RCU flavors, SRCU read-side critical sections can +run on idle and even offline CPUs. +This ability requires that srcu_read_lock() and +srcu_read_unlock() contain memory barriers, which means +that SRCU readers will run a bit slower than would RCU readers. +It also motivates the smp_mb__after_srcu_read_unlock() +API, which, in combination with srcu_read_unlock(), +guarantees a full memory barrier. + +

+The +SRCU API +includes +srcu_read_lock(), +srcu_read_unlock(), +srcu_dereference(), +srcu_dereference_check(), +synchronize_srcu(), +synchronize_srcu_expedited(), +call_srcu(), +srcu_barrier(), and +srcu_read_lock_held(). +It also includes +DEFINE_SRCU(), +DEFINE_STATIC_SRCU(), and +init_srcu_struct() +APIs for defining and initializing srcu_struct structures. + +
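A hedged sketch of defining and using an SRCU domain; foo_srcu, foo_srcu_gp, and struct foo are hypothetical, and foo_lock is reused from the earlier sketches:

    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU(foo_srcu);           /* Hypothetical SRCU domain. */
    static struct foo __rcu *foo_srcu_gp;   /* Hypothetical pointer. */

    static int foo_srcu_get_a(void)
    {
        struct foo *p;
        int idx, ret = -1;

        idx = srcu_read_lock(&foo_srcu);
        p = srcu_dereference(foo_srcu_gp, &foo_srcu);
        if (p)
            ret = p->a;     /* Sleeping would also be legal here. */
        srcu_read_unlock(&foo_srcu, idx);
        return ret;
    }

    static void foo_srcu_update_a(struct foo *newp)
    {
        struct foo *oldp;

        spin_lock(&foo_lock);
        oldp = rcu_dereference_protected(foo_srcu_gp,
                                         lockdep_is_held(&foo_lock));
        rcu_assign_pointer(foo_srcu_gp, newp);
        spin_unlock(&foo_lock);
        synchronize_srcu(&foo_srcu);    /* Waits only on foo_srcu readers. */
        kfree(oldp);
    }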

Tasks RCU

+ +

+Some forms of tracing use “trampolines” to handle the +binary rewriting required to install different types of probes. +It would be good to be able to free old trampolines, which sounds +like a job for some form of RCU. +However, because it is necessary to be able to install a trace +anywhere in the code, it is not possible to use read-side markers +such as rcu_read_lock() and rcu_read_unlock(). +In addition, it does not work to have these markers in the trampoline +itself, because there would need to be instructions following +rcu_read_unlock(). +Although synchronize_rcu() would guarantee that execution +reached the rcu_read_unlock(), it would not be able to +guarantee that execution had completely left the trampoline. + +

+The solution, in the form of +Tasks RCU, +is to have implicit +read-side critical sections that are delimited by voluntary context +switches, that is, calls to schedule(), +cond_resched_rcu_qs(), and +synchronize_rcu_tasks(). +In addition, transitions to and from userspace execution also delimit +tasks-RCU read-side critical sections. + +

+The tasks-RCU API is quite compact, consisting only of +call_rcu_tasks(), +synchronize_rcu_tasks(), and +rcu_barrier_tasks(). + +
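A hedged sketch of trampoline teardown built on this API; struct foo_tramp and the helper functions are hypothetical names for illustration only:

    static void foo_free_trampoline(struct foo_tramp *tp)
    {
        foo_unpublish_trampoline(tp);   /* Hypothetical: stop new entries. */

        /*
         * Wait for every task to pass through a voluntary context
         * switch (or userspace execution), after which no task can
         * still be executing within the trampoline.
         */
        synchronize_rcu_tasks();

        foo_release_trampoline_pages(tp);   /* Hypothetical. */
    }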

Possible Future Changes

+ +

+One of the tricks that RCU uses to attain update-side scalability is +to increase grace-period latency with increasing numbers of CPUs. +If this becomes a serious problem, it will be necessary to rework the +grace-period state machine so as to avoid the need for the additional +latency. + +

+Expedited grace periods scan the CPUs, so their latency and overhead +increases with increasing numbers of CPUs. +If this becomes a serious problem on large systems, it will be necessary +to do some redesign to avoid this scalability problem. + +

+RCU disables CPU hotplug in a few places, perhaps most notably in the +expedited grace-period and rcu_barrier() operations. +If there is a strong reason to use expedited grace periods in CPU-hotplug +notifiers, it will be necessary to avoid disabling CPU hotplug. +This would introduce some complexity, so there had better be a very +good reason. + +

+The tradeoff between grace-period latency on the one hand and interruptions +of other CPUs on the other hand may need to be re-examined. +The desire is of course for zero grace-period latency as well as zero +interprocessor interrupts undertaken during an expedited grace period +operation. +While this ideal is unlikely to be achievable, it is quite possible that +further improvements can be made. + +

+The multiprocessor implementations of RCU use a combining tree that +groups CPUs so as to reduce lock contention and increase cache locality. +However, this combining tree does not spread its memory across NUMA +nodes nor does it align the CPU groups with hardware features such +as sockets or cores. +Such spreading and alignment is currently believed to be unnecessary +because the hotpath read-side primitives do not access the combining +tree, nor does call_rcu() in the common case. +If you believe that your architecture needs such spreading and alignment, +then your architecture should also benefit from the +rcutree.rcu_fanout_leaf boot parameter, which can be set +to the number of CPUs in a socket, NUMA node, or whatever. +If the number of CPUs is too large, use a fraction of the number of +CPUs. +If the number of CPUs is a large prime number, well, that certainly +is an “interesting” architectural choice! +More flexible arrangements might be considered, but only if +rcutree.rcu_fanout_leaf has proven inadequate, and only +if the inadequacy has been demonstrated by a carefully run and +realistic system-level workload. + +

+Please note that arrangements that require RCU to remap CPU numbers will +require extremely good demonstration of need and full exploration of +alternatives. + +

+There is an embarrassingly large number of flavors of RCU, and this +number has been increasing over time. +Perhaps it will be possible to combine some at some future date. + +

+RCU's various kthreads are reasonably recent additions. +It is quite likely that adjustments will be required to more gracefully +handle extreme loads. +It might also be necessary to be able to relate CPU utilization by +RCU's kthreads and softirq handlers to the code that instigated this +CPU utilization. +For example, RCU callback overhead might be charged back to the +originating call_rcu() instance, though probably not +in production kernels. + +

Summary

+ +

+This document has presented more than two decades' worth of RCU +requirements. +Given that the requirements keep changing, this will not be the last +word on this subject, but at least it serves to get an important +subset of the requirements set forth. + +

Acknowledgments

+ +I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, +Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and +Andy Lutomirski for their help in rendering +this article human readable, and to Michelle Rankin for her support +of this effort. +Other contributions are acknowledged in the Linux kernel's git archive. +The cartoon is copyright (c) 2013 by Melissa Broussard, +and is provided +under the terms of the Creative Commons Attribution-Share Alike 3.0 +United States license. + +

@@QQAL@@

diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh
new file mode 100755
index 0000000..d354f06
--- /dev/null
+++ b/Documentation/RCU/Design/htmlqqz.sh
@@ -0,0 +1,108 @@
+#!/bin/sh
+#
+# Usage: sh htmlqqz.sh file
+#
+# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
+# Commands, all of which must be on a line by themselves:
+#
+# "<p>@@QQ@@": Start of a quick quiz.
+# "<p>@@QQA@@": Start of a quick-quiz answer.
+# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
+# "<p>@@QQAL@@": Place to put quick-quiz answer list.
+#
+# Places the result in file.html.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.
+
+fn=$1
+if test ! -r $fn.htmlx
+then
+	echo "Error: $fn.htmlx unreadable."
+	exit 1
+fi
+
+echo "<!-- DO NOT HAND EDIT. -->" > $fn.html
+echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html
+awk < $fn.htmlx >> $fn.html '
+
+state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
+	print $0;
+	if ($0 ~ /^<p>@@QQ/)
+		print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
+	next;
+}
+
+state == "" && $1 == "<p>@@QQ@@" {
+	qqn++;
+	qqlineno = NR;
+	haveqq = 1;
+	state = "qq";
+	print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
+	next;
+}
+
+state == "qq" && $1 != "<p>@@QQA@@" {
+	qq[qqn] = qq[qqn] $0 "\n";
+	print $0
+	if ($0 ~ /^<p>@@QQ/)
+		print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
+	next;
+}
+
+state == "qq" && $1 == "<p>@@QQA@@" {
+	state = "qqa";
+	print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
+	next;
+}
+
+state == "qqa" && $1 != "<p>@@QQE@@" {
+	qqa[qqn] = qqa[qqn] $0 "\n";
+	if ($0 ~ /^<p>@@QQ/)
+		print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
+	next;
+}
+
+state == "qqa" && $1 == "<p>@@QQE@@" {
+	state = "";
+	next;
+}
+
+state == "" && $1 == "<p>@@QQAL@@" {
+	haveqq = "";
+	print "<h3><a name=\"Answers to Quick Quizzes\">"
+	print "Answers to Quick Quizzes</a></h3>"
+	print "";
+	for (i = 1; i <= qqn; i++) {
+		print "<a name=\"qq" i "answer\"></a>"
+		print "<p><b>Quick Quiz " i "</b>:"
+		print qq[i];
+		print "";
+		print "<p><b>Answer</b>:"
+		print qqa[i];
+		print "";
+		print "<p><a href=\"#Quick Quiz " i "\"><b>Back to Quick Quiz " i "</b>.</a>"
+		print "";
+	}
+	next;
+}
+
+END {
+	if (state != "")
+		print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
+	else if (haveqq)
+		print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
+}'
--
cgit v1.1

From 701e80312fd10270f9c44371e5a229d37a9ae172 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 7 Oct 2015 15:06:44 -0700
Subject: Documentation: Record bottom-bit-zero guarantee for ->next

This commit records RCU's guarantee that the bottom bit of the rcu_head
structure's ->next field will remain zero for callbacks posted via
call_rcu(), but not necessarily for kfree_rcu() or some possible future
call_rcu_lazy() variant that might one day be created for
energy-efficiency purposes.

Signed-off-by: Paul E. McKenney
[ paulmck: Updates URLs as suggested by Josh Triplett. ]
---
 .../RCU/Design/Requirements/Requirements.html  | 43 ++++++++++++++++++++++
 .../RCU/Design/Requirements/Requirements.htmlx | 43 ++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 36de7aa..871f627 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -1678,6 +1678,7 @@ Some of the relevant points of interest are as follows:

  • Scheduler and RCU.
  • Tracing and RCU.
  • Energy Efficiency. +
  • Memory Efficiency.
  • Performance, Scalability, Response Time, and Reliability. @@ -2006,6 +2007,48 @@ I learned of many of these requirements via angry phone calls: Flaming me on the Linux-kernel mailing list was apparently not sufficient to fully vent their ire at RCU's energy-efficiency bugs! +

    Memory Efficiency

    + +

    +Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the rcu_head structure +used by call_rcu() and kfree_rcu(). +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The page structure is a case in point, as evidenced by +the many occurrences of the union keyword within that structure. + +

    +This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the rcu_head structures that +are waiting for a grace period to elapse. +It is also the reason why rcu_head structures do not contain +debug information, such as fields tracking the file and line of the +call_rcu() or kfree_rcu() that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the ->func field will often provide +the needed debug information. + +

    +However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the page structure, the rcu_head field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +race conditions, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +rcu_head structure's ->next field. +RCU makes this guarantee as long as call_rcu() +is used to post the callback, as opposed to kfree_rcu() +or some future “lazy” +variant of call_rcu() that might one day be created for +energy-efficiency purposes. +

    Performance, Scalability, Response Time, and Reliability

    diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 1168010..a544db4 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1837,6 +1837,7 @@ Some of the relevant points of interest are as follows:
  • Scheduler and RCU.
  • Tracing and RCU.
  • Energy Efficiency. +
  • Memory Efficiency.
  • Performance, Scalability, Response Time, and Reliability. @@ -2173,6 +2174,48 @@ I learned of many of these requirements via angry phone calls: Flaming me on the Linux-kernel mailing list was apparently not sufficient to fully vent their ire at RCU's energy-efficiency bugs! +

    Memory Efficiency

    + +

    +Although small-memory non-realtime systems can simply use Tiny RCU, +code size is only one aspect of memory efficiency. +Another aspect is the size of the rcu_head structure +used by call_rcu() and kfree_rcu(). +Although this structure contains nothing more than a pair of pointers, +it does appear in many RCU-protected data structures, including +some that are size critical. +The page structure is a case in point, as evidenced by +the many occurrences of the union keyword within that structure. + +

    +This need for memory efficiency is one reason that RCU uses hand-crafted +singly linked lists to track the rcu_head structures that +are waiting for a grace period to elapse. +It is also the reason why rcu_head structures do not contain +debug information, such as fields tracking the file and line of the +call_rcu() or kfree_rcu() that posted them. +Although this information might appear in debug-only kernel builds at some +point, in the meantime, the ->func field will often provide +the needed debug information. + +

    +However, in some cases, the need for memory efficiency leads to even +more extreme measures. +Returning to the page structure, the rcu_head field +shares storage with a great many other structures that are used at +various points in the corresponding page's lifetime. +In order to correctly resolve certain +race conditions, +the Linux kernel's memory-management subsystem needs a particular bit +to remain zero during all phases of grace-period processing, +and that bit happens to map to the bottom bit of the +rcu_head structure's ->next field. +RCU makes this guarantee as long as call_rcu() +is used to post the callback, as opposed to kfree_rcu() +or some future “lazy” +variant of call_rcu() that might one day be created for +energy-efficiency purposes. +

    Performance, Scalability, Response Time, and Reliability

    -- cgit v1.1 From 01d3ad3834891f19a2620a105415feac93296eeb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:35:35 -0700 Subject: documentation: Cover requirements controlling stall warnings This commit adds verbiage on boot and sysfs parameters that can be used to control RCU CPU stall warnings, both to change the timeout and to suppress these warnings entirely. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 25 +++++++++++++++++++++- .../RCU/Design/Requirements/Requirements.htmlx | 25 +++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 871f627..cc5b587 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1618,12 +1618,35 @@ guard against mishaps and misuse: supplied the needed patch.
  • An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat. + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + RCU_CPU_STALL_TIMEOUT Kconfig option, or, + alternatively, by the + rcupdate.rcu_cpu_stall_timeout boot/sysfs + parameter. However, RCU is not obligated to produce this splat unless there is a grace period waiting on that particular RCU read-side critical section. +

    + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with rcupdate.rcu_cpu_stall_suppress + to suppress the splats. + This kernel parameter may also be set via sysfs. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the rcu_sysrq_start() and + rcu_sysrq_end() API members to be called before + and after long sysrq dumps. + RCU also supplies the rcu_panic() notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + +

    This requirement made itself known in the early 1990s, pretty much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux.

  • Although it would be very good to detect pointers leaking out of RCU read-side critical sections, there is currently no good way of doing this. diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index a544db4..23524d7 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1777,12 +1777,35 @@ guard against mishaps and misuse: supplied the needed patch.
  • An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat. + eventually trigger an RCU CPU stall warning splat, with + the duration of “eventually” being controlled by the + RCU_CPU_STALL_TIMEOUT Kconfig option, or, + alternatively, by the + rcupdate.rcu_cpu_stall_timeout boot/sysfs + parameter. However, RCU is not obligated to produce this splat unless there is a grace period waiting on that particular RCU read-side critical section. +

    + Some extreme workloads might intentionally delay + RCU grace periods, and systems running those workloads can + be booted with rcupdate.rcu_cpu_stall_suppress + to suppress the splats. + This kernel parameter may also be set via sysfs. + Furthermore, RCU CPU stall warnings are counter-productive + during sysrq dumps and during panics. + RCU therefore supplies the rcu_sysrq_start() and + rcu_sysrq_end() API members to be called before + and after long sysrq dumps. + RCU also supplies the rcu_panic() notifier that is + automatically invoked at the beginning of a panic to suppress + further RCU CPU stall warnings. + +

    This requirement made itself known in the early 1990s, pretty much the first time that it was necessary to debug a CPU stall. + That said, the initial implementation in DYNIX/ptx was quite + generic in comparison with that of Linux.

  • Although it would be very good to detect pointers leaking out of RCU read-side critical sections, there is currently no good way of doing this. -- cgit v1.1 From 0825458b1dbc39ec6840ee2e45b1fedb1b4b4ca1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:43:31 -0700 Subject: documentation: Composability analogies This commit expands on RCU's composability by comparing it to that of transactional memory and of locking. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 8 ++++++++ Documentation/RCU/Design/Requirements/Requirements.htmlx | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index cc5b587..1052471 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1495,6 +1495,14 @@ section, neither of which is conducive to a long-lived and prosperous kernel.

    +It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. + +

    In short, although RCU read-side critical sections are highly composable, care is required in some situations, just as is the case for any other composable synchronization mechanism. diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 23524d7..5b76e21 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -1654,6 +1654,14 @@ section, neither of which is conducive to a long-lived and prosperous kernel.

    +It is worth noting that RCU is not alone in limiting composability. +For example, many transactional-memory implementations prohibit +composing a pair of transactions separated by an irrevocable +operation (for example, a network receive operation). +For another example, lock-based critical sections can be composed +surprisingly freely, but only if deadlock is avoided. + +

    In short, although RCU read-side critical sections are highly composable, care is required in some situations, just as is the case for any other composable synchronization mechanism. -- cgit v1.1 From a4b575627e8d1a2498a921940813266d4e26ff56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 15:52:25 -0700 Subject: documentation: Expand on scheduler/RCU deadlock requirements This commit adds a second option for avoiding scheduler/RCU deadlocks, namely that preemption be disabled across the entire RCU read-side critical section in question. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 14 +++++++++----- Documentation/RCU/Design/Requirements/Requirements.htmlx | 14 +++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 1052471..ab513ed 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1942,12 +1942,16 @@ RCU depends on the scheduler, and the scheduler uses RCU to protect some of its data structures. This means the scheduler is forbidden from acquiring the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless -it also releases them before exiting that same -RCU read-side critical section. -This same prohibition also applies to any lock that is acquired +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) preemption is disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. -Violating this rule results in deadlock. +Adhering to this rule prevents preemptible RCU from invoking +rcu_read_unlock_special() while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock.

    For RCU's part, the preemptible-RCU rcu_read_unlock() diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 5b76e21..f7c817f 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -2109,12 +2109,16 @@ RCU depends on the scheduler, and the scheduler uses RCU to protect some of its data structures. This means the scheduler is forbidden from acquiring the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless -it also releases them before exiting that same -RCU read-side critical section. -This same prohibition also applies to any lock that is acquired +in the middle of an outermost RCU read-side critical section unless either +(1) it releases them before exiting that same +RCU read-side critical section, or +(2) preemption is disabled across +that entire RCU read-side critical section. +This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. -Violating this rule results in deadlock. +Adhering to this rule prevents preemptible RCU from invoking +rcu_read_unlock_special() while either runqueue or +priority-inheritance locks are held, thus avoiding deadlock.

    For RCU's part, the preemptible-RCU rcu_read_unlock() -- cgit v1.1 From 4b689330b1a5858e88831b3752e9a6692a5c7bdb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 Oct 2015 08:51:45 -0700 Subject: documentation: Clarify RCU memory barriers and requirements The RCU requirements do not make it absolutely clear that the memory-barrier requirements are not intended to replace the fundamental requirement that all pre-existing RCU readers complete before a grace period completes. This commit therefore pulls the memory-barrier requirements into a separate section and explicitly calls out the relationship between the memory-barrier requirements and the fundamental requirement. Signed-off-by: Paul E. McKenney --- .../RCU/Design/Requirements/Requirements.html | 66 +++++++++++++--------- .../RCU/Design/Requirements/Requirements.htmlx | 66 +++++++++++++--------- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index ab513ed..96cdcf7 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -80,6 +80,8 @@ These are: Grace-Period Guarantee

  • Publish-Subscribe Guarantee +
  • + Memory-Barrier Guarantees
  • RCU Primitives Guaranteed to Execute Unconditionally
  • @@ -499,9 +501,37 @@ might the compiler make use of?
    Answer

    -This simple linked-data-structure scenario clearly demonstrates the need -for RCU's stringent memory-ordering guarantees on systems with more than -one CPU: +In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

    Memory-Barrier Guarantees

    + +

    +The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU:

    1. Each CPU that has an RCU read-side critical section that @@ -554,30 +584,12 @@ Are all these memory barriers really required?
      Answer

      -In short, RCU's publish-subscribe guarantee is provided by the combination -of rcu_assign_pointer() and rcu_dereference(). -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -

      -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling rcu_dereference() for subscription, nor did it -have anything resembling the smp_read_barrier_depends() -that was later subsumed into rcu_dereference(). -The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good two hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting rcu_dereference()! +Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to enforce this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must.

      RCU Primitives Guaranteed to Execute Unconditionally

      diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index f7c817f..2d0cd90 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -78,6 +78,8 @@ These are: Grace-Period Guarantee
    2. Publish-Subscribe Guarantee +
    3. + Memory-Barrier Guarantees
    4. RCU Primitives Guaranteed to Execute Unconditionally
    5. @@ -539,9 +541,37 @@ either rcu_access_pointer() or rcu_dereference().

      @@QQE@@

      -This simple linked-data-structure scenario clearly demonstrates the need -for RCU's stringent memory-ordering guarantees on systems with more than -one CPU: +In short, RCU's publish-subscribe guarantee is provided by the combination +of rcu_assign_pointer() and rcu_dereference(). +This guarantee allows data elements to be safely added to RCU-protected +linked data structures without disrupting RCU readers. +This guarantee can be used in combination with the grace-period +guarantee to also allow data elements to be removed from RCU-protected +linked data structures, again without disrupting RCU readers. + +

+This guarantee was only partially premeditated. +DYNIX/ptx used an explicit memory barrier for publication, but had nothing +resembling rcu_dereference() for subscription, nor did it +have anything resembling the smp_read_barrier_depends() +that was later subsumed into rcu_dereference(). +The need for these operations made itself known quite suddenly at a +late-1990s meeting with the DEC Alpha architects, back in the days when +DEC was still a free-standing company. +It took the Alpha architects a good hour to convince me that any sort +of barrier would ever be needed, and it then took me a good two hours +to convince them that their documentation did not make this point clear. +More recent work with the C and C++ standards committees has provided +much education on tricks and traps from the compiler. +In short, compilers were much less tricky in the early 1990s, but in +2015, don't even think about omitting rcu_dereference()! + +

      Memory-Barrier Guarantees

      + +

      +The previous section's simple linked-data-structure scenario clearly +demonstrates the need for RCU's stringent memory-ordering guarantees on +systems with more than one CPU:

      1. Each CPU that has an RCU read-side critical section that @@ -653,30 +683,12 @@ adhered to the as-if rule than it is to actually adhere to it!

        @@QQE@@

        -In short, RCU's publish-subscribe guarantee is provided by the combination -of rcu_assign_pointer() and rcu_dereference(). -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -

        -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling rcu_dereference() for subscription, nor did it -have anything resembling the smp_read_barrier_depends() -that was later subsumed into rcu_dereference(). -The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good two hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting rcu_dereference()! +Note that these memory-barrier requirements do not replace the fundamental +RCU requirement that a grace period wait for all pre-existing readers. +On the contrary, the memory barriers called out in this section must operate in +such a way as to enforce this fundamental requirement. +Of course, different implementations enforce this requirement in different +ways, but enforce it they must.

        RCU Primitives Guaranteed to Execute Unconditionally

        -- cgit v1.1 From c64c4b0f9a183e4c73abff848378afa6edf796c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Nov 2015 23:05:32 -0800 Subject: documentation: Update RCU requirements based on expedited changes Because RCU-sched expedited grace periods now use IPIs and interact with rcu_read_unlock(), it is no longer sufficient to disable preemption across RCU read-side critical sections that acquire and hold scheduler locks. It is now necessary to instead disable interrupts. This commit documents this change. Signed-off-by: Paul E. McKenney --- Documentation/RCU/Design/Requirements/Requirements.html | 12 ++++++++++-- Documentation/RCU/Design/Requirements/Requirements.htmlx | 10 +++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 96cdcf7..a725f99 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,5 @@ - + @@ -1957,7 +1957,7 @@ the runqueue locks and the priority-inheritance locks in the middle of an outermost RCU read-side critical section unless either (1) it releases them before exiting that same RCU read-side critical section, or -(2) preemption is disabled across +(2) interrupts are disabled across that entire RCU read-side critical section. This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. @@ -1966,6 +1966,14 @@ Adhering to this rule prevents preemptible RCU from invoking priority-inheritance locks are held, thus avoiding deadlock.

        +Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a rcu_read_unlock() to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. + +

        For RCU's part, the preemptible-RCU rcu_read_unlock() implementation must be written carefully to avoid similar deadlocks. In particular, rcu_read_unlock() must tolerate an diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx index 2d0cd90..3a97ba4 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx @@ -2124,7 +2124,7 @@ the runqueue locks and the priority-inheritance locks in the middle of an outermost RCU read-side critical section unless either (1) it releases them before exiting that same RCU read-side critical section, or -(2) preemption is disabled across +(2) interrupts are disabled across that entire RCU read-side critical section. This same prohibition also applies (recursively!) to any lock that is acquired while holding any lock to which this prohibition applies. @@ -2133,6 +2133,14 @@ Adhering to this rule prevents preemptible RCU from invoking priority-inheritance locks are held, thus avoiding deadlock.

        +Prior to v4.4, it was only necessary to disable preemption across +RCU read-side critical sections that acquired scheduler locks. +In v4.4, expedited grace periods started using IPIs, and these +IPIs could force a rcu_read_unlock() to take the slowpath. +Therefore, this expedited-grace-period change required disabling of +interrupts, not just preemption. + +

        For RCU's part, the preemptible-RCU rcu_read_unlock() implementation must be written carefully to avoid similar deadlocks. In particular, rcu_read_unlock() must tolerate an -- cgit v1.1 From f84cfbb0ff269b427a0db591e22ac6948c554ab4 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 23 Nov 2015 17:04:17 -0500 Subject: Documentation/memory-barriers.txt: Fix ACCESS_ONCE thinko In commit 2ecf810121c7 ("Documentation/memory-barriers.txt: Add needed ACCESS_ONCE() calls to memory-barriers.txt") the statement "Q = P" was converted to "ACCESS_ONCE(Q) = P". This should have been "Q = ACCESS_ONCE(P)". It later became "WRITE_ONCE(Q, P)". This doesn't match the following text, which is "Q = LOAD P". Change the statement to be "Q = READ_ONCE(P)". Signed-off-by: Chris Metcalf Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index aef9487..85304ebd 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU: (*) On any given CPU, dependent memory accesses will be issued in order, with respect to itself. This means that for: - WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q); + Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q); the CPU will issue the following memory operations: @@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU: and always in that order. On most systems, smp_read_barrier_depends() does nothing, but it is required for DEC Alpha. The READ_ONCE() - and WRITE_ONCE() are required to prevent compiler mischief. Please - note that you should normally use something like rcu_dereference() - instead of open-coding smp_read_barrier_depends(). + is required to prevent compiler mischief. Please note that you + should normally use something like rcu_dereference() instead of + open-coding smp_read_barrier_depends(). (*) Overlapping loads and stores within a particular CPU will appear to be ordered within that CPU. This means that for: -- cgit v1.1 From 83977d273b609477e31af4c993697b75936acde0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Sep 2015 22:11:48 -0700 Subject: rcutorture: Add batch number to script printout Currently, the scripts print "----Start batch" at the beginning of each batch, which does serve as a good visual delimiter between batches. Unfortunately, if there are a lot of batches, it is hard to quickly estimate test runtime from the output of "--dryrun sched". This commit therefore adds a batch number, so that the beginning-of-batch output looks like this "----Start batch 10" for the tenth batch. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index f648360..013c482 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -307,10 +307,10 @@ awk < $T/cfgcpu.pack \ } # Dump out the scripting required to run one test batch. 
-function dump(first, pastlast) +function dump(first, pastlast, batchnum) { - print "echo ----Start batch: `date`"; - print "echo ----Start batch: `date` >> " rd "/log"; + print "echo ----Start batch " batchnum ": `date`"; + print "echo ----Start batch " batchnum ": `date` >> " rd "/log"; jn=1 for (j = first; j < pastlast; j++) { builddir=KVM "/b" jn @@ -371,25 +371,28 @@ END { njobs = i; nc = ncpus; first = 0; + batchnum = 1; # Each pass through the following loop considers one test. for (i = 0; i < njobs; i++) { if (ncpus == 0) { # Sequential test specified, each test its own batch. - dump(i, i + 1); + dump(i, i + 1, batchnum); first = i; + batchnum++; } else if (nc < cpus[i] && i != 0) { # Out of CPUs, dump out a batch. - dump(first, i); + dump(first, i, batchnum); first = i; nc = ncpus; + batchnum++; } # Account for the CPUs needed by the current test. nc -= cpus[i]; } # Dump the last batch. if (ncpus != 0) - dump(first, i); + dump(first, i, batchnum); }' >> $T/script cat << ___EOF___ >> $T/script -- cgit v1.1 From a0e3a3aa2841d5720a277de53b6882eb8b2ef698 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 Dec 2015 17:34:10 -0800 Subject: rcutorture: Flag nonexistent RCU GP kthread Currently, if the RCU grace-period kthread has not yet been created, the starvation-check code will print zero for the state, which maps to TASK_RUNNING. This could clearly be quite confusing, so this commit prints ~0, which does not map to any legal ->state value. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cd..e2315fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1201,7 +1201,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); } /* -- cgit v1.1 From b1adb3e2736b695821badc715d2c7a5d873b8b94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Oct 2015 10:38:16 -0700 Subject: rcutorture: Dump stack when GP kthread stalls This commit increases debug information in the case where the grace-period kthread is being prevented from running, by dumping that kthread's stack. Signed-off-by: Paul E. McKenney [ paulmck: Split into prior commit and this commit, as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e2315fb..7b78c88 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1196,12 +1196,15 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) + if (j - gpa > 2 * HZ) { pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* -- cgit v1.1 From 542e83329db44622a401b74b4be0ea2d5f0850be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Oct 2015 11:41:06 -0700 Subject: rcutorture: Default grace period to three minutes, allow override The default test grace period of two minutes is insufficient in some cases and excessive in others.
This commit therefore increases the default to three minutes, but also adds a --shutdown-grace parameter to allow the default to be overridden. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 4 +--- tools/testing/selftests/rcutorture/bin/kvm.sh | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5236e07..d39273d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -38,8 +38,6 @@ # # Authors: Paul E. McKenney -grace=120 - T=/tmp/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 touch $T @@ -214,7 +212,7 @@ then else break fi - if test $kruntime -ge $((seconds + grace)) + if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) then echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 kill -KILL $qemu_pid diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 013c482..4a43176 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -42,6 +42,7 @@ TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KMAKE_ARG="" +TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu resdir="" configs="" @@ -149,6 +150,11 @@ do resdir=$2 shift ;; + --shutdown-grace) + checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error' + TORTURE_SHUTDOWN_GRACE=$2 + shift + ;; --torture) checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' TORTURE_SUITE=$2 @@ -266,6 +272,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC +TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE if ! test -e $resdir then -- cgit v1.1 From 91bf6a83e5a121c9313ae47156dd47df46ea2aac Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 15 Oct 2015 16:10:07 -0700 Subject: rcutorture: Remove CONFIG_RCU_USER_QS from rcutorture selftest doc Commit d1ec4c34c7a9 ("rcu: Drop RCU_USER_QS in favor of NO_HZ_FULL") has removed RCU_USER_QS from Kconfig file, so remove it from some documents to avoid any confusion. Signed-off-by: Yang Shi Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/doc/TINY_RCU.txt | 1 - tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 4 ---- 2 files changed, 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt index 9ef33a7..24396ae 100644 --- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt +++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt @@ -20,7 +20,6 @@ CONFIG_PROVE_RCU CONFIG_NO_HZ_FULL_SYSIDLE CONFIG_RCU_NOCB_CPU -CONFIG_RCU_USER_QS Meaningless for TINY_RCU. 
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 657f3a0..4e2b189 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -72,10 +72,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE Always used in KVM testing. -CONFIG_RCU_USER_QS - - Redundant with CONFIG_NO_HZ_FULL. - CONFIG_PREEMPT_RCU CONFIG_TREE_RCU -- cgit v1.1 From 18aff33e7314253b9437234bd6d69ddc4827de70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 13:35:28 -0800 Subject: rcutorture: Print symbolic name for rcu_torture_writer_state Currently, rcu_torture_writer_state is printed as an integer, which slows debugging. This commit therefore prints a symbolic name in addition to the integer. Signed-off-by: Paul E. McKenney [ paulmck: More "const", as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e..d2988d0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); -- cgit v1.1 From 6b50e119c440b7532ed749b635a58b3839f62992 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 14:39:26 -0800 Subject: rcutorture: Print symbolic name for ->gp_state Currently, ->gp_state is printed as an integer, which slows debugging. This commit therefore prints a symbolic name in addition to the integer. Signed-off-by: Paul E. McKenney [ paulmck: Updated to fix relational operator called out by Dan Carpenter. ] [ paulmck: More "const", as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 15 +++++++++++++-- kernel/rcu/tree.h | 12 ++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7b78c88..3163541 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1187,6 +1187,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Complain about starvation of grace-period kthread. 
*/ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) @@ -1197,10 +1207,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); if (rsp->gp_kthread) sched_show_task(rsp->gp_kthread); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f32bebb..a3fb6fe 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -545,6 +545,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ -- cgit v1.1 From 5708c6475789ac5f58ff620e78bd08ca2caa1f23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Dec 2015 12:37:57 -0800 Subject: torture: Abbreviate console error dump Currently, the scripts print a list of warning/bug indicators from the console.log file. This works well if there are only a few warnings or bugs, but can be quite annoying if there is a large number. This commit therefore prints a summary listing the number of each type of warning/bug indicator, but only if there is at least one such indicator. The full list is stored in the results directory at console.log.diags, which makes it easier to find the warning/bugs in the full console.log. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- .../selftests/rcutorture/bin/parse-console.sh | 41 ++++++++++++++++++---- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index d8f35cf..844787a 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -24,9 +24,6 @@ # # Authors: Paul E. McKenney -T=/tmp/abat-chk-badness.sh.$$ -trap 'rm -f $T' 0 - file="$1" title="$2" @@ -36,9 +33,41 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? 
fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T -if test -s $T +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +if test -s $1.diags then print_warning Assertion failure in $file $title - cat $T + # cat $1.diags + summary="" + n_badness=`grep -c Badness $1` + if test "$n_badness" -ne 0 + then + summary="$summary Badness: $n_badness" + fi + n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'` + if test "$n_warn" -ne 0 + then + summary="$summary Warnings: $n_warn" + fi + n_bugs=`egrep -c 'BUG|Oops:' $1` + if test "$n_bugs" -ne 0 + then + summary="$summary Bugs: $n_bugs" + fi + n_calltrace=`grep -c 'Call Trace:' $1` + if test "$n_calltrace" -ne 0 + then + summary="$summary Call Traces: $n_calltrace" + fi + n_lockdep=`grep -c =========== $1` + if test "$n_lockdep" -ne 0 + then + summary="$summary lockdep: $n_lockdep" + fi + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` + if test "$n_stalls" -ne 0 + then + summary="$summary Stalls: $n_stalls" + fi + print_warning Summary: $summary fi -- cgit v1.1 From c979ff991764a2e620db0b1bfb0a105b9cf78b6a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Dec 2015 15:53:11 -0800 Subject: torture: Place console.log files correctly from the get-go Currently, the console output files ("console.log") are placed in the build directory initially, then copied to the results directory. One problem with this is that if a qemu refuses to die in a timely fashion after a kernel hang, it will continue to write after the next qemu starts up, resulting in confusing output from the old instance of qemu. This commit prevents such confusion by placing the console.log files into the results directory to begin with, so that a given instance of qemu is always writing only to its own console.log file. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index d39273d..0f80eef 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -150,7 +150,7 @@ fi qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`" # Generate architecture-specific and interaction-specific qemu arguments -qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`" +qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" # Generate qemu -append arguments qemu_append="`identify_qemu_append "$QEMU"`" @@ -166,7 +166,7 @@ then touch $resdir/buildonly exit 0 fi -echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log +echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & qemu_pid=$!
@@ -222,6 +222,5 @@ then done fi -cp $builddir/console.log $resdir parse-torture.sh $resdir/console.log $title parse-console.sh $resdir/console.log $title -- cgit v1.1 From 8dd3303001976aa8583bf20f6b93590c74114308 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 4 Dec 2015 14:07:05 +0100 Subject: x86/mm: Introduce max_possible_pfn max_possible_pfn will be used to track the maximum possible PFN for memory that isn't present in the E820 table and could be hotplugged later. By default, max_possible_pfn is initialized with max_pfn, but it may later be updated with the highest PFN of the hotpluggable memory ranges declared in the ACPI SRAT table, if any are present. Signed-off-by: Igor Mammedov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: fujita.tomonori@lab.ntt.co.jp Cc: konrad.wilk@oracle.com Cc: pbonzini@redhat.com Cc: revers@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1449234426-273049-2-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ arch/x86/mm/srat.c | 2 ++ include/linux/bootmem.h | 4 ++++ mm/bootmem.c | 1 + mm/nobootmem.c | 1 + 5 files changed, 10 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 29db25f..16a8465 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1048,6 +1048,8 @@ void __init setup_arch(char **cmdline_p) if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820_end_of_ram_pfn(); + max_possible_pfn = max_pfn; + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index c2aea63..b5f8218 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -203,6 +203,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", (unsigned long long)start, (unsigned long long)end - 1); + max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); + return 0; out_err_bad_srat: bad_srat(); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f589222..35b22f9 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -19,6 +19,10 @@ extern unsigned long min_low_pfn; * highest page */ extern unsigned long max_pfn; +/* + * highest possible page + */ +extern unsigned long long max_possible_pfn; #ifndef CONFIG_NO_BOOTMEM /* diff --git a/mm/bootmem.c b/mm/bootmem.c index 3b63807..91e32bc 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e57cf24..99feb2b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) -- cgit v1.1 From ec941c5ffede4d788b9fc008f9eeca75b9e964f5 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 4 Dec 2015 14:07:06 +0100 Subject: x86/mm/64: Enable SWIOTLB if system has SRAT memory regions above MAX_DMA32_PFN When a memory-hotplug-enabled system is booted with less than 4GB of RAM, and more RAM is hotplugged later, 32-bit devices stop functioning with the following error: nommu_map_single: overflow
327b4f8c0+1522 of device mask ffffffff The reason for this is that an x86_64 system booted with less than 4GB of RAM does not enable SWIOTLB, so when memory is hotplugged beyond MAX_DMA32_PFN, devices that expect 32-bit addresses cannot handle the resulting 64-bit addresses. Fix it by tracking the maximum possible PFN when parsing memory affinity structures from the ACPI SRAT table, and by enabling SWIOTLB if there are hotpluggable memory regions beyond MAX_DMA32_PFN. It fixes KVM guests when they use emulated devices (reproduces with ata_piix, e1000 and usb devices, RHBZ: 1275941, 1275977, 1271527). It also fixes Hyper-V and VMware guests with emulated devices, which are affected by this issue as well. Signed-off-by: Igor Mammedov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: fujita.tomonori@lab.ntt.co.jp Cc: konrad.wilk@oracle.com Cc: pbonzini@redhat.com Cc: revers@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1449234426-273049-3-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index adf0392..7c577a1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -88,7 +88,7 @@ int __init pci_swiotlb_detect_4gb(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if (!no_iommu && max_pfn > MAX_DMA32_PFN) + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif return swiotlb; -- cgit v1.1 From 0017960f38a2470e70d9f1991228e2b55b2abe0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Nov 2015 16:26:35 +0100 Subject: perf/core: Collapse common IPI pattern Various functions implement the same pattern to send IPIs to an event's CPU. Collapse the easy ones into a common helper function to reduce duplication. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 180 ++++++++++++++++++++++----------------------------- 1 file changed, 76 insertions(+), 104 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 39cf4a4..c3d61b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -126,6 +126,37 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +static void event_function_call(struct perf_event *event, + int (*active)(void *), + void (*inactive)(void *), + void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + cpu_function_call(event->cpu, active, data); + return; + } + +again: + if (!task_function_call(task, active, data)) + return; + + raw_spin_lock_irq(&ctx->lock); + if (ctx->is_active) { + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out().
+ */ + task = ctx->task; + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + inactive(data); + raw_spin_unlock_irq(&ctx->lock); +} + #define EVENT_OWNER_KERNEL ((void *) -1) static bool is_kernel_event(struct perf_event *event) @@ -1629,6 +1660,17 @@ struct remove_event { bool detach_group; }; +static void ___perf_remove_from_context(void *info) +{ + struct remove_event *re = info; + struct perf_event *event = re->event; + struct perf_event_context *ctx = event->ctx; + + if (re->detach_group) + perf_group_detach(event); + list_del_event(event, ctx); +} + /* * Cross CPU call to remove a performance event * @@ -1656,7 +1698,6 @@ static int __perf_remove_from_context(void *info) return 0; } - /* * Remove the event from a task's (or a CPU's) list of events. * @@ -1673,7 +1714,6 @@ static int __perf_remove_from_context(void *info) static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; struct remove_event re = { .event = event, .detach_group = detach_group, @@ -1681,44 +1721,8 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group lockdep_assert_held(&ctx->mutex); - if (!task) { - /* - * Per cpu events are removed via an smp call. The removal can - * fail if the CPU is currently offline, but in that case we - * already called __perf_remove_from_context from - * perf_event_exit_cpu. - */ - cpu_function_call(event->cpu, __perf_remove_from_context, &re); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, &re)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - if (detach_group) - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_remove_from_context, + ___perf_remove_from_context, &re); } /* @@ -2067,6 +2071,18 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +static void ___perf_install_in_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. + */ + add_event_to_ctx(event, ctx); +} + /* * Cross CPU call to install and enable a performance event * @@ -2143,48 +2159,14 @@ perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = ctx->task; - lockdep_assert_held(&ctx->mutex); event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; - if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. 
- */ - cpu_function_call(cpu, __perf_install_in_context, event); - return; - } - -retry: - if (!task_function_call(task, __perf_install_in_context, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_install_in_context, + ___perf_install_in_context, event); } /* @@ -4154,6 +4136,22 @@ struct period_event { u64 value; }; +static void ___perf_event_period(void *info) +{ + struct period_event *pe = info; + struct perf_event *event = pe->event; + u64 value = pe->value; + + if (event->attr.freq) { + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } + + local64_set(&event->hw.period_left, 0); +} + static int __perf_event_period(void *info) { struct period_event *pe = info; @@ -4190,8 +4188,6 @@ static int __perf_event_period(void *info) static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct period_event pe = { .event = event, }; - struct perf_event_context *ctx = event->ctx; - struct task_struct *task; u64 value; if (!is_sampling_event(event)) @@ -4206,34 +4202,10 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - task = ctx->task; pe.value = value; - if (!task) { - cpu_function_call(event->cpu, __perf_event_period, &pe); - return 0; - } - -retry: - if (!task_function_call(task, __perf_event_period, &pe)) - return 0; - - raw_spin_lock_irq(&ctx->lock); - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - task = ctx->task; - goto retry; - } - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_period, + ___perf_event_period, &pe); return 0; } -- cgit v1.1 From da008ee72cabdee0ee98d3a3580ca5cfb8d2d1f1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 30 Nov 2015 09:48:42 -0800 Subject: perf/x86/intel: Fix __initconst declaration in the RAPL perf driver Fix a definition in the perf rapl driver. __initconst must be applied to a const object, but to declare a const pointer you need to use * const ..., not const ... * This fixes a section attribute conflict with LTO builds. 
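[ ed. note: a minimal standalone C sketch of the distinction being described, not part of the patch; the array names are made up:

	/* Array of "pointer to const char": the strings are immutable, but
	 * the array of pointers itself is writable, so the object is not
	 * const and conflicts with the read-only section that __initconst
	 * selects: */
	static const char *names_wrong[] __initconst = { "pp0-core", "package" };

	/* Array of "const pointer to const char": both the pointers and the
	 * strings are immutable, so the object is genuinely const and may
	 * legitimately carry __initconst: */
	static const char * const names_right[] __initconst = { "pp0-core", "package" };
]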
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1448905722-2767-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index ed446bd..fb5843d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -63,7 +63,7 @@ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ #define NR_RAPL_DOMAINS 0x4 -static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { +static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", "dram", -- cgit v1.1 From 153a4334c439cfb62e1d31cee0c790ba4157813d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Dec 2015 17:00:57 -0800 Subject: x86/headers: Don't include asm/processor.h in asm/atomic.h asm/atomic.h doesn't really need asm/processor.h anymore. Everything it uses has moved to other header files. So remove that include. processor.h is a nasty header that includes lots of other headers and makes it prone to include loops. Removing the include here makes asm/atomic.h a "leaf" header that can be safely included in most other headers. The only fallout is in the lib/atomic tester which relied on this implicit include. Give it an explicit include. (the include is in ifdef because the user is also in ifdef) Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1449018060-1742-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 1 - arch/x86/include/asm/atomic64_32.h | 1 - lib/atomic64_test.c | 4 ++++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ae5fb83..3e86742 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index a11c30b..a984111 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -3,7 +3,6 @@ #include #include -#include //#include /* An 64bit atomic type */ diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 83c33a5b..d51e25a 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -16,6 +16,10 @@ #include #include +#ifdef CONFIG_X86 +#include /* for boot_cpu_has below */ +#endif + #define TEST(bit, op, c_op, val) \ do { \ atomic##bit##_set(&v, v0); \ -- cgit v1.1 From bd2a634d9e852b9b6100f9ae9c3c790b0ff91ce0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Dec 2015 17:00:58 -0800 Subject: tracepoints: Move struct tracepoint to new tracepoint-defs.h header Steven recommended open coding access to tracepoint->key to add trace points to headers. Unfortunately this is difficult for some headers (such as x86 asm/msr.h) because including tracepoint.h includes so many other headers that it causes include loops. 
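[ ed. note: for readers unfamiliar with include loops, a deliberately contrived C sketch of the failure mode; the header names are hypothetical:

	/* a.h -- needs b_helper() from b.h */
	#ifndef A_H
	#define A_H
	#include "b.h"
	static inline int a_helper(void) { return b_helper() + 1; }
	#endif

	/* b.h -- includes a.h for unrelated reasons */
	#ifndef B_H
	#define B_H
	#include "a.h"
	static inline int b_helper(void) { return 0; }
	#endif

	/*
	 * #include "a.h" first: compiles, because a.h pulls in all of b.h
	 * before using b_helper().
	 * #include "b.h" first: fails, because b.h re-enters a.h, whose
	 * nested #include "b.h" is stopped by the B_H guard, so a_helper()
	 * is compiled before b_helper() has been declared.
	 */

With chains as deep as tracepoint.h -> rcupdate.h -> ..., the same effect is much harder to see and to untangle. ]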
The main problem is the include of linux/rcupdate.h, which pulls in a lot of other headers. The rcu header is only needed when actually defining trace points. Move the struct tracepoint into a separate tracepoint-defs.h header that can be included without pulling in all of RCU. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1449018060-1742-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- include/linux/tracepoint-defs.h | 27 +++++++++++++++++++++++++++ include/linux/tracepoint.h | 16 +--------------- 2 files changed, 28 insertions(+), 15 deletions(-) create mode 100644 include/linux/tracepoint-defs.h diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h new file mode 100644 index 0000000..e1ee97c --- /dev/null +++ b/include/linux/tracepoint-defs.h @@ -0,0 +1,27 @@ +#ifndef TRACEPOINT_DEFS_H +#define TRACEPOINT_DEFS_H 1 + +/* + * File can be included directly by headers who only want to access + * tracepoint->key to guard out of line trace calls. Otherwise + * linux/tracepoint.h should be used. + */ + +#include +#include + +struct tracepoint_func { + void *func; + void *data; + int prio; +}; + +struct tracepoint { + const char *name; /* Tracepoint name */ + struct static_key key; + void (*regfunc)(void); + void (*unregfunc)(void); + struct tracepoint_func __rcu *funcs; +}; + +#endif diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 696a339c..f7c732b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,26 +17,12 @@ #include #include #include -#include +#include struct module; struct tracepoint; struct notifier_block; -struct tracepoint_func { - void *func; - void *data; - int prio; -}; - -struct tracepoint { - const char *name; /* Tracepoint name */ - struct static_key key; - void (*regfunc)(void); - void (*unregfunc)(void); - struct tracepoint_func __rcu *funcs; -}; - struct trace_enum_map { const char *system; const char *enum_string; -- cgit v1.1 From 7f47d8cc039f8746e0038fe05f1ddcb15a2e27f0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Dec 2015 17:00:59 -0800 Subject: x86, tracing, perf: Add trace point for MSR accesses For debugging low level code interacting with the CPU it is often useful to trace the MSR read/writes. This gives a concise summary of PMU and other operations. perf has an ad-hoc way to do this using trace_printk, but it's somewhat limited (and also now spews ugly boot messages when enabled) Instead define real trace points for all MSR accesses. This adds three new trace points: read_msr and write_msr and rdpmc. They also report if the access faulted (if *_safe is used) This allows filtering and triggering on specific MSR values, which allows various more advanced debugging techniques. All the values are well defined in the CPU documentation. The trace can be post processed with Documentation/trace/postprocess/decode_msr.py to add symbolic MSR names to the trace. I only added it to native MSR accesses in C, not paravirtualized or in entry*.S (which is not too interesting) Originally the patch kit moved the MSRs out of line. This uses an alternative approach recommended by Steven Rostedt of only moving the trace calls out of line, but open coding the access to the jump label. 
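[ ed. note: a hedged sketch of what a "leaf" consumer of the new header could look like; the event name is hypothetical, and it assumes tracepoint-defs.h pulls in the static-key declarations, as the new file below does:

	#include <linux/tracepoint-defs.h>	/* struct tracepoint, no RCU */

	extern struct tracepoint __tracepoint_some_event;

	static inline bool some_event_enabled(void)
	{
		/* Cheap static-key test, safe to use in low-level headers: */
		return static_key_false(&__tracepoint_some_event.key);
	}

Actually defining and registering the tracepoint still requires the full linux/tracepoint.h in a .c file. ]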
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1449018060-1742-3-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- Documentation/trace/events-msr.txt | 37 +++++++++++++++++ Documentation/trace/postprocess/decode_msr.py | 37 +++++++++++++++++ arch/x86/include/asm/msr-trace.h | 57 +++++++++++++++++++++++++++ arch/x86/include/asm/msr.h | 31 +++++++++++++++ arch/x86/lib/msr.c | 26 ++++++++++++ 5 files changed, 188 insertions(+) create mode 100644 Documentation/trace/events-msr.txt create mode 100644 Documentation/trace/postprocess/decode_msr.py create mode 100644 arch/x86/include/asm/msr-trace.h diff --git a/Documentation/trace/events-msr.txt b/Documentation/trace/events-msr.txt new file mode 100644 index 0000000..78c383b --- /dev/null +++ b/Documentation/trace/events-msr.txt @@ -0,0 +1,37 @@ + +The x86 kernel supports tracing most MSR (Model Specific Register) accesses. +To see the definition of the MSRs on Intel systems please see the SDM +at http://www.intel.com/sdm (Volume 3) + +Available trace points: + +/sys/kernel/debug/tracing/events/msr/ + +Trace MSR reads + +read_msr + +msr: MSR number +val: Value written +failed: 1 if the access failed, otherwise 0 + + +Trace MSR writes + +write_msr + +msr: MSR number +val: Value written +failed: 1 if the access failed, otherwise 0 + + +Trace RDPMC in kernel + +rdpmc + +The trace data can be post processed with the postprocess/decode_msr.py script + +cat /sys/kernel/debug/tracing/trace | decode_msr.py /usr/src/linux/include/asm/msr-index.h + +to add symbolic MSR names. + diff --git a/Documentation/trace/postprocess/decode_msr.py b/Documentation/trace/postprocess/decode_msr.py new file mode 100644 index 0000000..0ab40e0 --- /dev/null +++ b/Documentation/trace/postprocess/decode_msr.py @@ -0,0 +1,37 @@ +#!/usr/bin/python +# add symbolic names to read_msr / write_msr in trace +# decode_msr msr-index.h < trace +import sys +import re + +msrs = dict() + +with open(sys.argv[1] if len(sys.argv) > 1 else "msr-index.h", "r") as f: + for j in f: + m = re.match(r'#define (MSR_\w+)\s+(0x[0-9a-fA-F]+)', j) + if m: + msrs[int(m.group(2), 16)] = m.group(1) + +extra_ranges = ( + ( "MSR_LASTBRANCH_%d_FROM_IP", 0x680, 0x69F ), + ( "MSR_LASTBRANCH_%d_TO_IP", 0x6C0, 0x6DF ), + ( "LBR_INFO_%d", 0xdc0, 0xddf ), +) + +for j in sys.stdin: + m = re.search(r'(read|write)_msr:\s+([0-9a-f]+)', j) + if m: + r = None + num = int(m.group(2), 16) + if num in msrs: + r = msrs[num] + else: + for er in extra_ranges: + if er[1] <= num <= er[2]: + r = er[0] % (num - er[1],) + break + if r: + j = j.replace(" " + m.group(2), " " + r + "(" + m.group(2) + ")") + print j, + + diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h new file mode 100644 index 0000000..7567225 --- /dev/null +++ b/arch/x86/include/asm/msr-trace.h @@ -0,0 +1,57 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM msr + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE msr-trace + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/ + +#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MSR_H + +#include + +/* + * Tracing for x86 model specific registers. Directly maps to the + * RDMSR/WRMSR instructions. 
+ */ + +DECLARE_EVENT_CLASS(msr_trace_class, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed), + TP_STRUCT__entry( + __field( unsigned, msr ) + __field( u64, val ) + __field( int, failed ) + ), + TP_fast_assign( + __entry->msr = msr; + __entry->val = val; + __entry->failed = failed; + ), + TP_printk("%x, value %llx%s", + __entry->msr, + __entry->val, + __entry->failed ? " #GP" : "") +); + +DEFINE_EVENT(msr_trace_class, read_msr, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed) +); + +DEFINE_EVENT(msr_trace_class, write_msr, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed) +); + +DEFINE_EVENT(msr_trace_class, rdpmc, + TP_PROTO(unsigned msr, u64 val, int failed), + TP_ARGS(msr, val, failed) +); + +#endif /* _TRACE_MSR_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 77d8b28..fedd6e6 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -57,11 +57,34 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif +#ifdef CONFIG_TRACEPOINTS +/* + * Be very careful with includes. This header is prone to include loops. + */ +#include +#include + +extern struct tracepoint __tracepoint_read_msr; +extern struct tracepoint __tracepoint_write_msr; +extern struct tracepoint __tracepoint_rdpmc; +#define msr_tracepoint_active(t) static_key_false(&(t).key) +extern void do_trace_write_msr(unsigned msr, u64 val, int failed); +extern void do_trace_read_msr(unsigned msr, u64 val, int failed); +extern void do_trace_rdpmc(unsigned msr, u64 val, int failed); +#else +#define msr_tracepoint_active(t) false +static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {} +static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {} +#endif + static inline unsigned long long native_read_msr(unsigned int msr) { DECLARE_ARGS(val, low, high); asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } @@ -78,6 +101,8 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr), [fault] "i" (-EIO)); + if (msr_tracepoint_active(__tracepoint_read_msr)) + do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); } @@ -85,6 +110,8 @@ static inline void native_write_msr(unsigned int msr, unsigned low, unsigned high) { asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); + if (msr_tracepoint_active(__tracepoint_write_msr)) + do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } /* Can be uninlined because referenced by paravirt */ @@ -102,6 +129,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, : "c" (msr), "0" (low), "d" (high), [fault] "i" (-EIO) : "memory"); + if (msr_tracepoint_active(__tracepoint_write_msr)) + do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } @@ -160,6 +189,8 @@ static inline unsigned long long native_read_pmc(int counter) DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); + if (msr_tracepoint_active(__tracepoint_rdpmc)) + do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
return EAX_EDX_VAL(val, low, high); } diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 4362373..004c861 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,6 +1,8 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include struct msr *msrs_alloc(void) { @@ -108,3 +110,27 @@ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } + +#ifdef CONFIG_TRACEPOINTS +void do_trace_write_msr(unsigned msr, u64 val, int failed) +{ + trace_write_msr(msr, val, failed); +} +EXPORT_SYMBOL(do_trace_write_msr); +EXPORT_TRACEPOINT_SYMBOL(write_msr); + +void do_trace_read_msr(unsigned msr, u64 val, int failed) +{ + trace_read_msr(msr, val, failed); +} +EXPORT_SYMBOL(do_trace_read_msr); +EXPORT_TRACEPOINT_SYMBOL(read_msr); + +void do_trace_rdpmc(unsigned counter, u64 val, int failed) +{ + trace_rdpmc(counter, val, failed); +} +EXPORT_SYMBOL(do_trace_rdpmc); +EXPORT_TRACEPOINT_SYMBOL(rdpmc); + +#endif -- cgit v1.1 From f1ad44884a4c421ceaa9a4a8242aeeee6f686670 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Dec 2015 17:01:00 -0800 Subject: perf/x86: Remove old MSR perf tracing code Now that we have generic MSR trace points we can remove the old hackish perf MSR read tracing code. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1449018060-1742-4-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index a7ab350..799e6bd 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,17 +14,7 @@ #include -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - unsigned int _msr = (msr); \ - u64 _val = (val); \ - trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ - (unsigned long long)(_val)); \ - native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ -} while (0) -#endif +/* To enable MSR tracing please use the generic trace points. */ /* * | NHM/WSM | SNB | -- cgit v1.1 From 11803f97f0da4487ab947a975e5817fdc62a23e0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Nov 2015 06:57:13 -0500 Subject: switch befs long symlinks to page_symlink_operations just give them the right ->readpage()... 
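[ ed. note: the general shape of such a symlink ->readpage() as a hedged sketch; fs_read_link() is a hypothetical helper standing in for the filesystem's own read routine (befs_read_lsymlink() in the diff below):

	static int example_symlink_readpage(struct file *unused, struct page *page)
	{
		struct inode *inode = page->mapping->host;
		char *link = kmap(page);
		int err = 0;

		/* Fill the page with the NUL-terminated link target. */
		if (fs_read_link(inode, link, PAGE_SIZE) < 0) {
			SetPageError(page);
			err = -EIO;
		} else {
			SetPageUptodate(page);
		}
		kunmap(page);
		unlock_page(page);
		return err;
	}

With that in place, the generic page_symlink_inode_operations' follow_link simply reads the target out of the page cache, so the filesystem no longer needs its own follow_link/put_link pair. ]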
Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 46aedac..1c8b0dc 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static const char *befs_follow_link(struct dentry *, void **); +static int befs_symlink_readpage(struct file *, struct page *); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = { .bmap = befs_bmap, }; -static const struct inode_operations befs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = befs_follow_link, - .put_link = kfree_put_link, +static const struct address_space_operations befs_symlink_aops = { + .readpage = befs_symlink_readpage, }; /* @@ -398,7 +396,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &befs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { - inode->i_op = &befs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &befs_symlink_aops; } else { inode->i_link = befs_ino->i_data.symlink; inode->i_op = &simple_symlink_inode_operations; @@ -463,31 +462,35 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. 
*/ -static const char * -befs_follow_link(struct dentry *dentry, void **cookie) +static int befs_symlink_readpage(struct file *unused, struct page *page) { - struct super_block *sb = dentry->d_sb; - struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link; + char *link = kmap(page); - if (len == 0) { + if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); - return ERR_PTR(-EIO); + goto fail; } befs_debug(sb, "Follow long symlink"); - link = kmalloc(len, GFP_NOFS); - if (!link) - return ERR_PTR(-ENOMEM); if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); befs_error(sb, "Failed to read entire long symlink"); - return ERR_PTR(-EIO); + goto fail; } link[len - 1] = '\0'; - return *cookie = link; + SetPageUptodate(page); + kunmap(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + kunmap(page); + unlock_page(page); + return -EIO; } /* -- cgit v1.1 From fb417f13aec5f202d35fce9bc7b654e1d3a749e8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Nov 2015 17:23:54 -0500 Subject: logfs: don't duplicate page_symlink_inode_operations Signed-off-by: Al Viro --- fs/logfs/dir.c | 8 +------- fs/logfs/inode.c | 2 +- fs/logfs/logfs.h | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index f9b45d4..99944a4 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -528,7 +528,7 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &logfs_symlink_iops; + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &logfs_reg_aops; return __logfs_create(dir, dentry, inode, target, destlen); @@ -776,12 +776,6 @@ fail: return -EIO; } -const struct inode_operations logfs_symlink_iops = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; - const struct inode_operations logfs_dir_iops = { .create = logfs_create, .link = logfs_link, diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index af49e2d..06baa92 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -64,7 +64,7 @@ static void logfs_inode_setops(struct inode *inode) inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFLNK: - inode->i_op = &logfs_symlink_iops; + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFSOCK: /* fall through */ diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 5f09376..209a26d 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) #endif /* dir.c */ -extern const struct inode_operations logfs_symlink_iops; extern const struct inode_operations logfs_dir_iops; extern const struct file_operations logfs_dir_fops; int logfs_replay_journal(struct super_block *sb); -- cgit v1.1 From c73119c58fef2590e0a2bef959a12cff7a07874b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Nov 2015 20:33:18 -0500 Subject: udf: don't duplicate page_symlink_inode_operations Signed-off-by: Al Viro --- fs/udf/inode.c | 2 +- fs/udf/namei.c | 7 +------ fs/udf/udfdecl.h | 1 - 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8d0b3ad..8675c2b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c 
@@ -1540,7 +1540,7 @@ reread: break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &udf_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; inode->i_mode = S_IFLNK | S_IRWXUGO; break; case ICBTAG_FILE_TYPE_MAIN: diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c97b5a8..d0e6de1 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -921,7 +921,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &udf_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { struct kernel_lb_addr eloc; @@ -1344,8 +1344,3 @@ const struct inode_operations udf_dir_inode_operations = { .rename = udf_rename, .tmpfile = udf_tmpfile, }; -const struct inode_operations udf_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 47bb3f5..ce169b4 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; -extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; -- cgit v1.1 From 9cdce3c074fbd7083923f15225e112a91daff8ed Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Nov 2015 18:24:17 -0500 Subject: ufs: get rid of ->setattr() for symlinks It was only needed for a couple of months in 2010, until UFS quota support got dropped. Since then it's equivalent to simple_setattr() (i.e. the default) for everything except regular files. And dropping it there allows converting all UFS symlinks to {page,simple}_symlink_inode_operations, getting rid of fs/ufs/symlink.c completely.
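The conversion below follows the usual fast/slow symlink split. Condensed to its skeleton for reference (illustrative only; EXAMPLE_I() and example_aops stand in for the UFS-specific names):

	if (!inode->i_blocks) {		/* fast: target lives in the inode itself */
		inode->i_link = (char *)EXAMPLE_I(inode)->inline_target;
		inode->i_op = &simple_symlink_inode_operations;
	} else {			/* slow: target lives in data blocks */
		inode->i_op = &page_symlink_inode_operations;
		inode->i_mapping->a_ops = &example_aops;
	}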
Signed-off-by: Al Viro --- fs/ufs/Makefile | 2 +- fs/ufs/inode.c | 4 ++-- fs/ufs/namei.c | 4 ++-- fs/ufs/symlink.c | 42 ------------------------------------------ fs/ufs/ufs.h | 4 ---- 5 files changed, 5 insertions(+), 51 deletions(-) delete mode 100644 fs/ufs/symlink.c diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 392db25..ec4a6b4 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o util.o + namei.o super.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a064cf44..737160a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -528,11 +528,11 @@ static void ufs_set_inode_ops(struct inode *inode) inode->i_mapping->a_ops = &ufs_aops; } else if (S_ISLNK(inode->i_mode)) { if (!inode->i_blocks) { - inode->i_op = &ufs_fast_symlink_inode_operations; inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + inode->i_op = &simple_symlink_inode_operations; } else { - inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; + inode->i_op = &page_symlink_inode_operations; } } else init_special_inode(inode, inode->i_mode, diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 4796655..24b0cbd 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -123,14 +123,14 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ - inode->i_op = &ufs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; } else { /* fast symlink */ - inode->i_op = &ufs_fast_symlink_inode_operations; + inode->i_op = &simple_symlink_inode_operations; inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; memcpy(inode->i_link, symname, l); inode->i_size = l-1; diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c deleted file mode 100644 index 874480b..0000000 --- a/fs/ufs/symlink.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * linux/fs/ufs/symlink.c - * - * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 - * - * Copyright (C) 1998 - * Daniel Pirkl - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/symlink.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/symlink.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext2 symlink handling code - */ - -#include "ufs_fs.h" -#include "ufs.h" - -const struct inode_operations ufs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = simple_follow_link, - .setattr = ufs_setattr, -}; - -const struct inode_operations ufs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 7da4aca..c87f4c3 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -136,10 +136,6 @@ extern __printf(3, 4) void ufs_panic(struct super_block *, const char *, const char *, ...); void ufs_mark_sb_dirty(struct super_block *sb); -/* symlink.c */ -extern const struct inode_operations ufs_fast_symlink_inode_operations; -extern const struct inode_operations ufs_symlink_inode_operations; - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; -- cgit v1.1 From aa80deab33a8fb180e718f5e45514db19aade165 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Nov 2015 18:26:34 -0500 Subject: namei: page_getlink() and page_follow_link_light() are the same thing Signed-off-by: Al Viro --- fs/namei.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0c3974c..4bae5cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4518,7 +4518,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ -static char *page_getlink(struct dentry * dentry, struct page **ppage) +static const char *page_getlink(struct dentry * dentry, void **cookie) { char *kaddr; struct page *page; @@ -4526,31 +4526,15 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; - *ppage = page; + *cookie = page; kaddr = kmap(page); nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); return kaddr; } -int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) -{ - struct page *page = NULL; - int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); - if (page) { - kunmap(page); - page_cache_release(page); - } - return res; -} -EXPORT_SYMBOL(page_readlink); - const char *page_follow_link_light(struct dentry *dentry, void **cookie) { - struct page *page = NULL; - char *res = page_getlink(dentry, &page); - if (!IS_ERR(res)) - *cookie = page; - return res; + return page_getlink(dentry, cookie); } EXPORT_SYMBOL(page_follow_link_light); @@ -4562,6 +4546,16 @@ void page_put_link(struct inode *unused, void *cookie) } EXPORT_SYMBOL(page_put_link); +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + void *cookie = NULL; + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &cookie)); + if (cookie) + page_put_link(NULL, cookie); + return res; +} +EXPORT_SYMBOL(page_readlink); + /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS */ -- cgit v1.1 From 
80602324d5a02fea97ed1328ba04e5d7f10aaff8 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:33 +0100 Subject: vfs: Remove vfs_xattr_cmp This function was only briefly used in security/integrity/evm, between commits 66dbc325 and 15647eb3. Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Signed-off-by: Al Viro --- fs/xattr.c | 19 ------------------- include/linux/xattr.h | 2 -- 2 files changed, 21 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 9b932b9..58bdabc 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -208,25 +208,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, return error; } -/* Compare an extended attribute value with the given value */ -int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, - const char *value, size_t size, gfp_t flags) -{ - char *xattr_value = NULL; - int rc; - - rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); - if (rc < 0) - return rc; - - if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) - rc = -EINVAL; - else - rc = 0; - kfree(xattr_value); - return rc; -} - ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 89474b9..45fa345 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -53,8 +53,6 @@ int generic_setxattr(struct dentry *dentry, const char *name, const void *value, int generic_removexattr(struct dentry *dentry, const char *name); ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); -int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, - const char *value, size_t size, gfp_t flags); struct simple_xattrs { struct list_head head; -- cgit v1.1 From 44cb0d3f778da6646f8e993245ee827a6b7df6d5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:34 +0100 Subject: gfs2: Remove gfs2_xattr_acl_chmod Function gfs2_xattr_acl_chmod is unused since commit e01580bf. 
Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Acked-by: Bob Peterson Signed-off-by: Al Viro --- fs/gfs2/xattr.c | 50 -------------------------------------------------- fs/gfs2/xattr.h | 1 - 2 files changed, 51 deletions(-) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 53ce76a..84f2d81 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1237,56 +1237,6 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, size, flags, handler->flags); } - -static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, - struct gfs2_ea_header *ea, char *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int amount = GFS2_EA_DATA_LEN(ea); - unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); - int ret; - - ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); - if (ret) - return ret; - - ret = gfs2_iter_unstuffed(ip, ea, data, NULL); - gfs2_trans_end(sdp); - - return ret; -} - -int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_ea_location el; - int error; - - error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); - if (error) - return error; - - if (GFS2_EA_IS_STUFFED(el.el_ea)) { - error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); - if (error == 0) { - gfs2_trans_add_meta(ip->i_gl, el.el_bh); - memcpy(GFS2_EA2DATA(el.el_ea), data, - GFS2_EA_DATA_LEN(el.el_ea)); - } - } else { - error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); - } - - brelse(el.el_bh); - if (error) - return error; - - error = gfs2_setattr_simple(inode, attr); - gfs2_trans_end(sdp); - return error; -} - static int ea_dealloc_indirect(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index d392f83..2d887c8 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); -extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ -- cgit v1.1 From 97d79299223baab330b194437e676d301f12d5f6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:35 +0100 Subject: posix acls: Remove duplicate xattr name definitions Remove POSIX_ACL_XATTR_{ACCESS,DEFAULT} and GFS2_POSIX_ACL_{ACCESS,DEFAULT} and replace them with the definitions in <uapi/linux/xattr.h>.
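Nothing changes on disk or at the syscall boundary here: the attribute names stay the same strings, only the macro spelling is consolidated. As a hedged userspace illustration (plain syscalls, no libacl; "/tmp" is an arbitrary path):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char buf[256];
		/* Same string the kernel now spells XATTR_NAME_POSIX_ACL_ACCESS. */
		ssize_t n = getxattr("/tmp", "system.posix_acl_access", buf, sizeof(buf));

		if (n < 0)
			perror("getxattr");	/* e.g. ENODATA if no ACL is set */
		else
			printf("ACL xattr is %zd bytes\n", n);
		return 0;
	}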
Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Signed-off-by: Al Viro --- drivers/staging/lustre/lustre/llite/xattr.c | 4 ++-- fs/9p/acl.c | 12 ++++++------ fs/btrfs/acl.c | 8 ++++---- fs/btrfs/inode.c | 8 ++++---- fs/ceph/acl.c | 16 ++++++++-------- fs/cifs/xattr.c | 16 ++++++++-------- fs/gfs2/acl.c | 4 ++-- fs/gfs2/acl.h | 2 -- fs/hfsplus/posix_acl.c | 8 ++++---- fs/jfs/acl.c | 8 ++++---- fs/nfs/nfs3acl.c | 4 ++-- fs/posix_acl.c | 4 ++-- fs/reiserfs/xattr_acl.c | 8 ++++---- fs/xfs/xfs_xattr.c | 8 ++++---- include/linux/posix_acl_xattr.h | 6 +----- 15 files changed, 55 insertions(+), 61 deletions(-) diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c index 4b7eb33..660b8ac 100644 --- a/drivers/staging/lustre/lustre/llite/xattr.c +++ b/drivers/staging/lustre/lustre/llite/xattr.c @@ -60,10 +60,10 @@ static int get_xattr_type(const char *name) { - if (!strcmp(name, POSIX_ACL_XATTR_ACCESS)) + if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)) return XATTR_ACL_ACCESS_T; - if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT)) + if (!strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT)) return XATTR_ACL_DEFAULT_T; if (!strncmp(name, XATTR_USER_PREFIX, diff --git a/fs/9p/acl.c b/fs/9p/acl.c index a7e2889..6ff30b1 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) return 0; } /* get the default/access acl values and cache them */ - dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT); - pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS); + dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT); + pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS); if (!IS_ERR(dacl) && !IS_ERR(pacl)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); @@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) goto err_free_out; switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -328,14 +328,14 @@ err_out: } const struct xattr_handler v9fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, }; const struct xattr_handler v9fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a..f89db0c 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) @@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? 
-EINVAL : 0; - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a70c579..d540fd7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3550,10 +3550,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int scanned = 0; if (!xattr_access) { - xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); - xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)); + xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); + xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8f84646..f197084 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &new_mode); if (ret < 0) @@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = acl ? -EINVAL : 0; goto out; } - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: ret = -EINVAL; @@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ceph_pagelist_encode_32(pagelist, acl && default_acl ? 
2 : 1); if (acl) { - size_t len = strlen(POSIX_ACL_XATTR_ACCESS); + size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); if (err) goto out_err; - ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS, + ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS, len); err = posix_acl_to_xattr(&init_user_ns, acl, tmp_buf, val_size1); @@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ceph_pagelist_append(pagelist, tmp_buf, val_size1); } if (default_acl) { - size_t len = strlen(POSIX_ACL_XATTR_DEFAULT); + size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; err = ceph_pagelist_encode_string(pagelist, - POSIX_ACL_XATTR_DEFAULT, len); + XATTR_NAME_POSIX_ACL_DEFAULT, len); err = posix_acl_to_xattr(&init_user_ns, default_acl, tmp_buf, val_size2); if (err < 0) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index ff9e1f8..f5dc2f0 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #endif /* CONFIG_CIFS_ACL */ } else { int temp; - temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); + temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); if (temp == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) @@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #else cifs_dbg(FYI, "set POSIX ACL not supported\n"); #endif - } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, @@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, @@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, #else cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ - } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be3b06..7919326 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type) { switch (type) { case ACL_TYPE_ACCESS: - return GFS2_POSIX_ACL_ACCESS; + return XATTR_POSIX_ACL_ACCESS; case ACL_TYPE_DEFAULT: - return GFS2_POSIX_ACL_DEFAULT; + return XATTR_POSIX_ACL_DEFAULT; } return NULL; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 2d65ec4..3af4f40 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -12,8 +12,6 @@ #include "incore.h" -#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" -#define GFS2_POSIX_ACL_DEFAULT 
"posix_acl_default" #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index df0c9af..afb33ed 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - xattr_name = POSIX_ACL_XATTR_ACCESS; + xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - xattr_name = POSIX_ACL_XATTR_DEFAULT; + xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); @@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, switch (type) { case ACL_TYPE_ACCESS: - xattr_name = POSIX_ACL_XATTR_ACCESS; + xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { err = posix_acl_equiv_mode(acl, &inode->i_mode); if (err < 0) @@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, break; case ACL_TYPE_DEFAULT: - xattr_name = POSIX_ACL_XATTR_DEFAULT; + xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 0c8ca83..4945685 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); @@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { rc = posix_acl_equiv_mode(acl, &inode->i_mode); if (rc < 0) @@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, } break; case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 1ebe2fc..17c0fa1 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size) int error; error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, - POSIX_ACL_XATTR_ACCESS, data, size, &result); + XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result); if (error) return error; error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, - POSIX_ACL_XATTR_DEFAULT, data, size, &result); + XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result); if (error) return error; return result; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4adde1e..571465d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -845,7 +845,7 @@ posix_acl_xattr_list(const struct xattr_handler *handler, } const struct xattr_handler posix_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, @@ -854,7 +854,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = { EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); const struct xattr_handler posix_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, .get = 
posix_acl_xattr_get, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 4b34b9d..558a16b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) @@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 839b35c..7288795 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -253,16 +253,16 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) * Then add the two synthetic ACL attributes. */ if (posix_acl_access_exists(inode)) { - error = list_one_attr(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS) + 1, + error = list_one_attr(XATTR_NAME_POSIX_ACL_ACCESS, - strlen(XATTR_NAME_POSIX_ACL_ACCESS) + 1, data, size, &context.count); if (error) return error; } if (posix_acl_default_exists(inode)) { - error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + error = list_one_attr(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT) + 1, data, size, &context.count); if (error) return error; diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 6f14ee2..e5e8ec4 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -9,16 +9,12 @@ #ifndef _POSIX_ACL_XATTR_H #define _POSIX_ACL_XATTR_H +#include <uapi/linux/xattr.h> #include <linux/posix_acl.h> -/* Extended attribute names */ -#define POSIX_ACL_XATTR_ACCESS "system.posix_acl_access" -#define POSIX_ACL_XATTR_DEFAULT "system.posix_acl_default" - /* Supported ACL a_version fields */ #define POSIX_ACL_XATTR_VERSION 0x0002 - /* An undefined entry e_id value */ #define ACL_UNDEFINED_ID (-1) -- cgit v1.1 From 98e9cb5711c68223f0e4d5201b9a6add255ec550 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:36 +0100 Subject: vfs: Distinguish between full xattr names and proper prefixes Add an additional "name" field to struct xattr_handler. When the name is set, the handler matches attributes with exactly that name. When the prefix is set instead, the handler matches attributes with the given prefix and with a non-empty suffix. This patch should avoid bugs like the one fixed in commit c361016a in the future.
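To make the new matching rule concrete, a hedged sketch of the two handler flavours (example_get is a hypothetical callback with the get signature used throughout this series):

	static int example_get(const struct xattr_handler *handler,
			       struct dentry *dentry, const char *name,
			       void *buffer, size_t size);

	/* Matches the attribute "system.example" and nothing else. */
	static const struct xattr_handler example_exact_handler = {
		.name	= "system.example",
		.get	= example_get,
	};

	/* Matches "user.<suffix>" with a non-empty suffix; a bare "user."
	 * is rejected with -EINVAL by the new xattr_resolve_name(). */
	static const struct xattr_handler example_ns_handler = {
		.prefix	= "user.",
		.get	= example_get,
	};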
Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Signed-off-by: Al Viro --- fs/9p/acl.c | 16 +++++----------- fs/9p/xattr.c | 4 ---- fs/ext2/xattr_security.c | 4 ---- fs/ext2/xattr_trusted.c | 4 ---- fs/ext2/xattr_user.c | 4 ---- fs/ext4/xattr_security.c | 4 ---- fs/ext4/xattr_trusted.c | 4 ---- fs/ext4/xattr_user.c | 4 ---- fs/f2fs/xattr.c | 20 ++++++-------------- fs/f2fs/xattr.h | 2 +- fs/hfsplus/xattr.c | 12 ------------ fs/jffs2/security.c | 6 ------ fs/jffs2/xattr_trusted.c | 4 ---- fs/jffs2/xattr_user.c | 4 ---- fs/nfs/nfs4proc.c | 8 +------- fs/ocfs2/xattr.c | 17 +---------------- fs/posix_acl.c | 10 +++------- fs/reiserfs/xattr.c | 3 ++- fs/squashfs/xattr.c | 3 --- fs/xattr.c | 27 +++++++++++++++++---------- fs/xfs/xfs_xattr.c | 6 ------ include/linux/xattr.h | 11 +++++++++++ 22 files changed, 47 insertions(+), 130 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 6ff30b1..9da967f 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -220,15 +220,12 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, struct posix_acl *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - v9ses = v9fs_dentry2v9ses(dentry); /* * We allow set/get/list of acl when access=client is not specified */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_get(dentry, handler->prefix, buffer, size); + return v9fs_xattr_get(dentry, handler->name, buffer, size); acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags); if (IS_ERR(acl)) @@ -250,16 +247,13 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, struct v9fs_session_info *v9ses; struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; - v9ses = v9fs_dentry2v9ses(dentry); /* * set the attribute on the remote. Without even looking at the * xattr value. 
We leave it to the server to validate */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_set(dentry, handler->prefix, value, size, + return v9fs_xattr_set(dentry, handler->name, value, size, flags); if (S_ISLNK(inode->i_mode)) @@ -319,7 +313,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, default: BUG(); } - retval = v9fs_xattr_set(dentry, handler->prefix, value, size, flags); + retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); if (!retval) set_cached_acl(inode, handler->flags, acl); err_out: @@ -328,14 +322,14 @@ err_out: } const struct xattr_handler v9fs_xattr_acl_access_handler = { - .prefix = XATTR_NAME_POSIX_ACL_ACCESS, + .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, }; const struct xattr_handler v9fs_xattr_acl_default_handler = { - .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, + .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index e3d026a..9dd9b47 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -143,8 +143,6 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, { const char *full_name = xattr_full_name(handler, name); - if (strcmp(name, "") == 0) - return -EINVAL; return v9fs_xattr_get(dentry, full_name, buffer, size); } @@ -154,8 +152,6 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler, { const char *full_name = xattr_full_name(handler, name); - if (strcmp(name, "") == 0) - return -EINVAL; return v9fs_xattr_set(dentry, full_name, value, size, flags); } diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index dfb0875..118bf231 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -28,8 +28,6 @@ ext2_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -39,8 +37,6 @@ ext2_xattr_security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 3150dd3..3f8f2bc 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -32,8 +32,6 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -43,8 +41,6 @@ ext2_xattr_trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 339a49b..afd45ab 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -34,8 +34,6 @@ ext2_xattr_user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return 
-EOPNOTSUPP; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, @@ -47,8 +45,6 @@ ext2_xattr_user_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 36f4c1a..195abc4 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -33,8 +33,6 @@ ext4_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -44,8 +42,6 @@ ext4_xattr_security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 4880890..121fdf9 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -36,8 +36,6 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -47,8 +45,6 @@ ext4_xattr_trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d2dec33..577fc12 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -35,8 +35,6 @@ ext4_xattr_user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, @@ -48,8 +46,6 @@ ext4_xattr_user_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a..21cfe51 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -30,6 +30,7 @@ static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler, const char *name, size_t len) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + const char *prefix; int total_len, prefix_len; switch (handler->flags) { @@ -47,10 +48,11 @@ static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler, return -EINVAL; } - prefix_len = strlen(handler->prefix); + prefix = xattr_prefix(handler); + prefix_len = strlen(prefix); total_len = prefix_len + len + 1; if (list && total_len <= list_size) { - memcpy(list, handler->prefix, prefix_len); + memcpy(list, prefix, prefix_len); memcpy(list + prefix_len, name, len); list[prefix_len + len] = '\0'; } @@ -77,8 +79,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, default: return -EINVAL; } - if 
(strcmp(name, "") == 0) - return -EINVAL; return f2fs_getxattr(d_inode(dentry), handler->flags, name, buffer, size, NULL); } @@ -103,9 +103,6 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, default: return -EINVAL; } - if (strcmp(name, "") == 0) - return -EINVAL; - return f2fs_setxattr(d_inode(dentry), handler->flags, name, value, size, NULL, flags); } @@ -114,7 +111,7 @@ static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler, struct dentry *dentry, char *list, size_t list_size, const char *name, size_t len) { - const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; + const char *xname = F2FS_SYSTEM_ADVISE_NAME; size_t size; size = strlen(xname) + 1; @@ -129,9 +126,6 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, { struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; - if (buffer) *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); @@ -143,8 +137,6 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, { struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) @@ -197,7 +189,7 @@ const struct xattr_handler f2fs_xattr_trusted_handler = { }; const struct xattr_handler f2fs_xattr_advise_handler = { - .prefix = F2FS_SYSTEM_ADVISE_PREFIX, + .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, .list = f2fs_xattr_advise_list, .get = f2fs_xattr_advise_get, diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100..79dccc8 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -27,7 +27,7 @@ #define F2FS_XATTR_REFCOUNT_MAX 1024 /* Name indexes */ -#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise" +#define F2FS_SYSTEM_ADVISE_NAME "system.advise" #define F2FS_XATTR_INDEX_USER 1 #define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 #define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e41a010..ab01530 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, char *xattr_name; int res; - if (!strcmp(name, "")) - return -EINVAL; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) @@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, int res; char *xattr_name; - if (!strcmp(name, "")) - return -EINVAL; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) @@ -853,9 +847,6 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; - /* * Don't allow retrieving properly prefixed attributes * by prepending them with "osx." @@ -876,9 +867,6 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; - /* * Don't allow setting properly prefixed attributes * by prepending them with "osx." 
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index bf12fe5..ea79932 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -52,9 +52,6 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; - return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size); } @@ -63,9 +60,6 @@ static int jffs2_security_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index a562da0..8b55fe4 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -20,8 +20,6 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size); } @@ -30,8 +28,6 @@ static int jffs2_trusted_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index cbc0472..b04335b 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -20,8 +20,6 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size); } @@ -30,8 +28,6 @@ static int jffs2_user_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8981803..f6f40aa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6253,9 +6253,6 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, const void *buf, size_t buflen, int flags) { - if (strcmp(key, "") != 0) - return -EINVAL; - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); } @@ -6263,9 +6260,6 @@ static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *dentry, const char *key, void *buf, size_t buflen) { - if (strcmp(key, "") != 0) - return -EINVAL; - return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); } @@ -8834,7 +8828,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { - .prefix = XATTR_NAME_NFSV4_ACL, + .name = XATTR_NAME_NFSV4_ACL, .list = nfs4_xattr_list_nfs4_acl, .get = nfs4_xattr_get_nfs4_acl, .set = nfs4_xattr_set_nfs4_acl, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e9164f0..5823f98 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index) if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; - - return handler ? 
handler->prefix : NULL; + return handler ? xattr_prefix(handler) : NULL; } static u32 ocfs2_xattr_name_hash(struct inode *inode, @@ -7249,8 +7248,6 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -7259,9 +7256,6 @@ static int ocfs2_xattr_security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7345,8 +7339,6 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -7355,9 +7347,6 @@ static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7398,8 +7387,6 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, @@ -7412,8 +7399,6 @@ static int ocfs2_xattr_user_set(const struct xattr_handler *handler, { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 571465d..17efd76 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -769,8 +769,6 @@ posix_acl_xattr_get(const struct xattr_handler *handler, struct posix_acl *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) @@ -797,8 +795,6 @@ posix_acl_xattr_set(const struct xattr_handler *handler, struct posix_acl *acl = NULL; int ret; - if (strcmp(name, "") != 0) - return -EINVAL; if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) @@ -832,7 +828,7 @@ posix_acl_xattr_list(const struct xattr_handler *handler, struct dentry *dentry, char *list, size_t list_size, const char *name, size_t name_len) { - const char *xname = handler->prefix; + const char *xname = handler->name; size_t size; if (!IS_POSIXACL(d_backing_inode(dentry))) @@ -845,7 +841,7 @@ posix_acl_xattr_list(const struct xattr_handler *handler, } const struct xattr_handler posix_acl_access_xattr_handler = { - .prefix = XATTR_NAME_POSIX_ACL_ACCESS, + .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, @@ -854,7 +850,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = { EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); const struct xattr_handler posix_acl_default_xattr_handler = { - .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, + .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, diff --git 
a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 66b26fd..efe2ed3 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers, return NULL; for_each_xattr_handler(handlers, xah) { - if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + const char *prefix = xattr_prefix(xah); + if (strncmp(prefix, name, strlen(prefix)) == 0) break; } diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 6a4cc34..2f0ccba 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -228,9 +228,6 @@ static int squashfs_xattr_handler_get(const struct xattr_handler *handler, struct dentry *d, const char *name, void *buffer, size_t size) { - if (name[0] == '\0') - return -EINVAL; - return squashfs_xattr_get(d_inode(d), handler->flags, name, buffer, size); } diff --git a/fs/xattr.c b/fs/xattr.c index 58bdabc..418ad69 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -681,13 +681,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) return NULL; for_each_xattr_handler(handlers, handler) { - const char *n = strcmp_prefix(*name, handler->prefix); + const char *n; + + n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { + if (!handler->prefix ^ !*n) { + if (*n) + continue; + return ERR_PTR(-EINVAL); + } *name = n; - break; + return handler; } } - return handler; + return ERR_PTR(-EOPNOTSUPP); } /* @@ -699,8 +706,8 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; + if (IS_ERR(handler)) + return PTR_ERR(handler); return handler->get(handler, dentry, name, buffer, size); } @@ -746,8 +753,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz if (size == 0) value = ""; /* empty EA, do not remove */ handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; + if (IS_ERR(handler)) + return PTR_ERR(handler); return handler->set(handler, dentry, name, value, size, flags); } @@ -761,8 +768,8 @@ generic_removexattr(struct dentry *dentry, const char *name) const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; + if (IS_ERR(handler)) + return PTR_ERR(handler); return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); } @@ -789,7 +796,7 @@ EXPORT_SYMBOL(generic_removexattr); const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { - size_t prefix_len = strlen(handler->prefix); + size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 7288795..36a4385 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -39,9 +39,6 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error, asize = size; - if (strcmp(name, "") == 0) - return -EINVAL; - /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { xflags |= ATTR_KERNOVAL; @@ -84,9 +81,6 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error; - if (strcmp(name, "") == 0) - return -EINVAL; - /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) xflags |= ATTR_CREATE; diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 
45fa345..03c847f 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -19,7 +19,13 @@ struct inode; struct dentry; +/* + * struct xattr_handler: When @name is set, match attributes with exactly that + * name. When @prefix is set instead, match attributes with that prefix and + * with a non-empty suffix. + */ struct xattr_handler { + const char *name; const char *prefix; int flags; /* fs private flags */ size_t (*list)(const struct xattr_handler *, struct dentry *dentry, @@ -54,6 +60,11 @@ int generic_removexattr(struct dentry *dentry, const char *name); ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); +static inline const char *xattr_prefix(const struct xattr_handler *handler) +{ + return handler->prefix ?: handler->name; +} + struct simple_xattrs { struct list_head head; spinlock_t lock; -- cgit v1.1 From 9172abbcd371f2f62903087bbd228f11d380b7b4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:37 +0100 Subject: btrfs: Use xattr handler infrastructure Use the VFS xattr handler infrastructure and get rid of similar code in the filesystem. Signed-off-by: Andreas Gruenbacher Reviewed-by: David Sterba Signed-off-by: Al Viro --- fs/btrfs/inode.c | 8 +-- fs/btrfs/xattr.c | 166 ++++++++++++++++++++----------------------------------- fs/btrfs/xattr.h | 2 - 3 files changed, 63 insertions(+), 113 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d540fd7..4fb8d6e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9994,7 +9994,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .setattr = btrfs_setattr, .mknod = btrfs_mknod, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10071,7 +10071,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10085,7 +10085,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, @@ -10100,7 +10100,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .update_time = btrfs_update_time, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 1fcd7b6..7cbef1a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -351,137 +351,89 @@ err: return ret; } -/* - * List of handlers for synthetic system.* attributes. All real ondisk - * attributes are handled directly. - */ -const struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL, -}; - -/* - * Check if the attribute is in a supported namespace. - * - * This is applied after the check for the synthetic attributes in the system - * namespace. 
- */ -static int btrfs_is_valid_xattr(const char *name) +static int btrfs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - int len = strlen(name); - int prefixlen = 0; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - prefixlen = XATTR_SECURITY_PREFIX_LEN; - else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - prefixlen = XATTR_SYSTEM_PREFIX_LEN; - else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - prefixlen = XATTR_TRUSTED_PREFIX_LEN; - else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - prefixlen = XATTR_USER_PREFIX_LEN; - else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - prefixlen = XATTR_BTRFS_PREFIX_LEN; - else - return -EOPNOTSUPP; - - /* - * The name cannot consist of just prefix - */ - if (len <= prefixlen) - return -EINVAL; + struct inode *inode = d_inode(dentry); - return 0; + name = xattr_full_name(handler, name); + return __btrfs_getxattr(inode, name, buffer, size); } -ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, + int flags) { - int ret; + struct inode *inode = d_inode(dentry); - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); + name = xattr_full_name(handler, name); + return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); +} - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - return __btrfs_getxattr(d_inode(dentry), name, buffer, size); +static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct dentry *dentry, + const char *name, const void *value, + size_t size, int flags) +{ + name = xattr_full_name(handler, name); + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); } +static const struct xattr_handler btrfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_btrfs_xattr_handler = { + .prefix = XATTR_BTRFS_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set_prop, +}; + +const struct xattr_handler *btrfs_xattr_handlers[] = { + &btrfs_security_xattr_handler, +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &btrfs_trusted_xattr_handler, + &btrfs_user_xattr_handler, + &btrfs_btrfs_xattr_handler, + NULL, +}; + int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). 
- */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - value, size, flags); - - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, - flags); + return generic_setxattr(dentry, name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - NULL, 0, XATTR_REPLACE); - - return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, - XATTR_REPLACE); + return generic_removexattr(dentry, name); } static int btrfs_initxattrs(struct inode *inode, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 5049608..96807b3 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); -- cgit v1.1 From aa7c5241c380adb7e6913549292c1b83c1469bda Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:38 +0100 Subject: tmpfs: Use xattr handler infrastructure Use the VFS xattr handler infrastructure and get rid of similar code in the filesystem. For implementing shmem_xattr_handler_set, we need a version of simple_xattr_set which removes the attribute when value is NULL. Use this to implement kernfs_iop_removexattr as well. 
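As an illustration of the resulting convention, removal becomes a plain set with a NULL value; a minimal sketch, not part of the patch itself (the xattrs argument stands in for whatever simple_xattrs list the filesystem keeps):

#include <linux/xattr.h>

/*
 * Remove one attribute through the unified set path: a NULL value
 * with XATTR_REPLACE deletes the xattr and fails with -ENODATA
 * when it does not exist.
 */
static int example_remove(struct simple_xattrs *xattrs, const char *name)
{
        return simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
}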
Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Cc: Hugh Dickins Cc: linux-mm@kvack.org Signed-off-by: Al Viro --- fs/kernfs/inode.c | 2 +- fs/xattr.c | 48 ++++++------------ include/linux/xattr.h | 4 +- mm/shmem.c | 131 ++++++++++++++++---------------------------- 4 files changed, 60 insertions(+), 125 deletions(-) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 756dd56..f97e1f7 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name) if (!attrs) return -ENOMEM; - return simple_xattr_remove(&attrs->xattrs, name); + return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE); } ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, diff --git a/fs/xattr.c b/fs/xattr.c index 418ad69..4ef8b37 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -851,8 +851,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, return ret; } -static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) +/** + * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems + * @xattrs: target simple_xattr list + * @name: name of the extended attribute + * @value: value of the xattr. If %NULL, will remove the attribute. + * @size: size of the new xattr + * @flags: %XATTR_{CREATE|REPLACE} + * + * If %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails + * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; + * otherwise, fails with -ENODATA. + * + * Returns 0 on success, -errno on failure. + */ +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; @@ -902,36 +916,6 @@ out: } -/** - * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems - * @xattrs: target simple_xattr list - * @name: name of the new extended attribute - * @value: value of the new xattr. If %NULL, will remove the attribute - * @size: size of the new xattr - * @flags: %XATTR_{CREATE|REPLACE} - * - * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails - * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; - * otherwise, fails with -ENODATA. - * - * Returns 0 on success, -errno on failure.
- */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) -{ - if (size == 0) - value = ""; /* empty EA, do not remove */ - return __simple_xattr_set(xattrs, name, value, size, flags); -} - -/* - * xattr REMOVE operation for in-memory/pseudo filesystems - */ -int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name) -{ - return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE); -} - static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 03c847f..4dd40cb 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -104,9 +104,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); -int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name); -ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, - size_t size); +ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_list_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); diff --git a/mm/shmem.c b/mm/shmem.c index 9187eee..fdfe6c8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2561,94 +2561,47 @@ static int shmem_initxattrs(struct inode *inode, return 0; } -static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL -}; - -static int shmem_xattr_validate(const char *name) -{ - struct { const char *prefix; size_t len; } arr[] = { - { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, - { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } - }; - int i; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - size_t preflen = arr[i].len; - if (strncmp(name, arr[i].prefix, preflen) == 0) { - if (!name[preflen]) - return -EINVAL; - return 0; - } - } - return -EOPNOTSUPP; -} - -static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int shmem_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); - - err = shmem_xattr_validate(name); - if (err) - return err; + name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } -static int shmem_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. 
- */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - err = shmem_xattr_validate(name); - if (err) - return err; + name = xattr_full_name(handler, name); return simple_xattr_set(&info->xattrs, name, value, size, flags); } -static int shmem_removexattr(struct dentry *dentry, const char *name) -{ - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); +static const struct xattr_handler shmem_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - err = shmem_xattr_validate(name); - if (err) - return err; +static const struct xattr_handler shmem_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - return simple_xattr_remove(&info->xattrs, name); -} +static const struct xattr_handler *shmem_xattr_handlers[] = { +#ifdef CONFIG_TMPFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &shmem_security_xattr_handler, + &shmem_trusted_xattr_handler, + NULL +}; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { @@ -2661,10 +2614,10 @@ static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, .follow_link = simple_follow_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif }; @@ -2673,10 +2626,10 @@ static const struct inode_operations shmem_symlink_inode_operations = { .follow_link = shmem_follow_link, .put_link = shmem_put_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif }; @@ -3148,10 +3101,10 @@ static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, .set_acl = simple_set_acl, #endif }; @@ -3170,10 +3123,10 @@ static const struct inode_operations shmem_dir_inode_operations = { .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, @@ -3183,10 +3136,10 @@ static const struct inode_operations shmem_dir_inode_operations = { static const struct inode_operations shmem_special_inode_operations = { #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = 
generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, -- cgit v1.1 From 786534b92f3ce68f4afc8a761c80b76887797b0a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:39 +0100 Subject: tmpfs: listxattr should include POSIX ACL xattrs When a file on tmpfs has an ACL or a Default ACL, listxattr should include the corresponding xattr name. Signed-off-by: Andreas Gruenbacher Reviewed-by: James Morris Cc: Hugh Dickins Cc: linux-mm@kvack.org Signed-off-by: Al Viro --- fs/kernfs/inode.c | 2 +- fs/xattr.c | 53 +++++++++++++++++++++++++++++++++++---------------- include/linux/xattr.h | 3 ++- mm/shmem.c | 2 +- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f97e1f7..16405ae 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) if (!attrs) return -ENOMEM; - return simple_xattr_list(&attrs->xattrs, buf, size); + return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) diff --git a/fs/xattr.c b/fs/xattr.c index 4ef8b37..c3af6c9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -921,38 +921,59 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } +static int xattr_list_one(char **buffer, ssize_t *remaining_size, + const char *name) +{ + size_t len = strlen(name) + 1; + if (*buffer) { + if (*remaining_size < len) + return -ERANGE; + memcpy(*buffer, name, len); + *buffer += len; + } + *remaining_size -= len; + return 0; +} + /* * xattr LIST operation for in-memory/pseudo filesystems */ -ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, - size_t size) +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, + char *buffer, size_t size) { bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; - size_t used = 0; + ssize_t remaining_size = size; + int err; + +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + if (inode->i_default_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } +#endif spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { - size_t len; - /* skip "trusted." 
attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } + err = xattr_list_one(&buffer, &remaining_size, xattr->name); + if (err) + return err; } spin_unlock(&xattrs->lock); - return used; + return size - remaining_size; } /* diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 4dd40cb..d23ce8e 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -104,7 +104,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); -ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, size_t size); +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, + size_t size); void simple_xattr_list_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); diff --git a/mm/shmem.c b/mm/shmem.c index fdfe6c8..297390f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2606,7 +2606,7 @@ static const struct xattr_handler *shmem_xattr_handlers[] = { static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(&info->xattrs, buffer, size); + return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ -- cgit v1.1 From 5d92b75c753ae27578ee764df3be650c67fa5877 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:40 +0100 Subject: xfs: Change how listxattr generates synthetic attributes Instead of adding the synthesized POSIX ACL attribute names after listing all non-synthesized attributes, generate them immediately when listing the non-synthesized attributes. In addition, merge xfs_xattr_put_listent and xfs_xattr_put_listent_sizes to ensure that the list size is computed correctly; the split version was overestimating the list size for non-root users. 
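The merged helper follows the usual listxattr convention of only accumulating the required size when no output buffer was supplied. A minimal sketch of that buffer-or-count pattern (the helper and its names are illustrative, not the XFS code):

#include <linux/string.h>
#include <linux/errno.h>

/*
 * Append "prefix" then "name" plus a terminating NUL to the list
 * buffer, or only account for the bytes when buf is NULL because
 * the caller is querying the needed size.
 */
static int put_one_name(char *buf, size_t bufsize, size_t *used,
                        const char *prefix, const char *name)
{
        size_t plen = strlen(prefix), nlen = strlen(name);

        if (buf) {
                if (*used + plen + nlen + 1 > bufsize)
                        return -ERANGE;
                memcpy(buf + *used, prefix, plen);
                memcpy(buf + *used + plen, name, nlen + 1);
        }
        *used += plen + nlen + 1;
        return 0;
}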
Signed-off-by: Andreas Gruenbacher Cc: Dave Chinner Cc: xfs@oss.sgi.com Signed-off-by: Al Viro --- fs/xfs/xfs_acl.c | 23 --------- fs/xfs/xfs_acl.h | 4 -- fs/xfs/xfs_xattr.c | 137 +++++++++++++++++++++++------------------------------ 3 files changed, 59 insertions(+), 105 deletions(-) diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6bb470f..2d5df1f 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -252,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode) return error; } -static int -xfs_acl_exists(struct inode *inode, unsigned char *name) -{ - int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); - - return (xfs_attr_get(XFS_I(inode), name, NULL, &len, - ATTR_ROOT|ATTR_KERNOVAL) == 0); -} - -int -posix_acl_access_exists(struct inode *inode) -{ - return xfs_acl_exists(inode, SGI_ACL_FILE); -} - -int -posix_acl_default_exists(struct inode *inode) -{ - if (!S_ISDIR(inode->i_mode)) - return 0; - return xfs_acl_exists(inode, SGI_ACL_DEFAULT); -} - int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 52f8255..286fa89 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -24,16 +24,12 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int posix_acl_access_exists(struct inode *inode); -extern int posix_acl_default_exists(struct inode *inode); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL -# define posix_acl_access_exists(inode) 0 -# define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 36a4385..110f1d7 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -129,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static unsigned int xfs_xattr_prefix_len(int flags) -{ - if (flags & XFS_ATTR_SECURE) - return sizeof("security"); - else if (flags & XFS_ATTR_ROOT) - return sizeof("trusted"); - else - return sizeof("user"); -} - -static const char *xfs_xattr_prefix(int flags) -{ - if (flags & XFS_ATTR_SECURE) - return xfs_xattr_security_handler.prefix; - else if (flags & XFS_ATTR_ROOT) - return xfs_xattr_trusted_handler.prefix; - else - return xfs_xattr_user_handler.prefix; -} - static int -xfs_xattr_put_listent( +__xfs_xattr_put_listent( struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen, - unsigned char *value) + char *prefix, + int prefix_len, + unsigned char *name, + int namelen) { - unsigned int prefix_len = xfs_xattr_prefix_len(flags); char *offset; int arraytop; - ASSERT(context->count >= 0); - - /* - * Only show root namespace entries if we are actually allowed to - * see them. 
- */ - if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) - return 0; + if (!context->alist) + goto compute_size; arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { @@ -177,17 +149,19 @@ xfs_xattr_put_listent( return 1; } offset = (char *)context->alist + context->count; - strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; + +compute_size: context->count += prefix_len + namelen + 1; return 0; } static int -xfs_xattr_put_listent_sizes( +xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, unsigned char *name, @@ -195,24 +169,55 @@ xfs_xattr_put_listent_sizes( int valuelen, unsigned char *value) { - context->count += xfs_xattr_prefix_len(flags) + namelen + 1; - return 0; -} + char *prefix; + int prefix_len; -static int -list_one_attr(const char *name, const size_t len, void *data, - size_t size, ssize_t *result) -{ - char *p = data + *result; + ASSERT(context->count >= 0); - *result += len; - if (!size) - return 0; - if (*result > size) - return -ERANGE; + if (flags & XFS_ATTR_ROOT) { +#ifdef CONFIG_XFS_POSIX_ACL + if (namelen == SGI_ACL_FILE_SIZE && + strncmp(name, SGI_ACL_FILE, + SGI_ACL_FILE_SIZE) == 0) { + int ret = __xfs_xattr_put_listent( + context, XATTR_SYSTEM_PREFIX, + XATTR_SYSTEM_PREFIX_LEN, + XATTR_POSIX_ACL_ACCESS, + strlen(XATTR_POSIX_ACL_ACCESS)); + if (ret) + return ret; + } else if (namelen == SGI_ACL_DEFAULT_SIZE && + strncmp(name, SGI_ACL_DEFAULT, + SGI_ACL_DEFAULT_SIZE) == 0) { + int ret = __xfs_xattr_put_listent( + context, XATTR_SYSTEM_PREFIX, + XATTR_SYSTEM_PREFIX_LEN, + XATTR_POSIX_ACL_DEFAULT, + strlen(XATTR_POSIX_ACL_DEFAULT)); + if (ret) + return ret; + } +#endif - strcpy(p, name); - return 0; + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + prefix = XATTR_TRUSTED_PREFIX; + prefix_len = XATTR_TRUSTED_PREFIX_LEN; + } else if (flags & XFS_ATTR_SECURE) { + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + } else { + prefix = XATTR_USER_PREFIX; + prefix_len = XATTR_USER_PREFIX_LEN; + } + + return __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); } ssize_t @@ -221,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); - int error; /* * First read the regular on-disk attributes. @@ -230,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.dp = XFS_I(inode); context.cursor = &cursor; context.resynch = 1; - context.alist = data; + context.alist = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; - - if (size) - context.put_listent = xfs_xattr_put_listent; - else - context.put_listent = xfs_xattr_put_listent_sizes; + context.put_listent = xfs_xattr_put_listent; xfs_attr_list_int(&context); if (context.count < 0) return -ERANGE; - /* - * Then add the two synthetic ACL attributes. 
- */ - if (posix_acl_access_exists(inode)) { - error = list_one_attr(XATTR_NAME_POSIX_ACL_ACCESS, - strlen(XATTR_NAME_POSIX_ACL_ACCESS) + 1, - data, size, &context.count); - if (error) - return error; - } - - if (posix_acl_default_exists(inode)) { - error = list_one_attr(XATTR_NAME_POSIX_ACL_DEFAULT, - strlen(XATTR_NAME_POSIX_ACL_DEFAULT) + 1, - data, size, &context.count); - if (error) - return error; - } - return context.count; } -- cgit v1.1 From 5dcf16df3ce48b2e4f798b1a11b5de2fc3cfd73a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 7 Dec 2015 02:36:25 +0000 Subject: perf machine: Pass correct string to dso__adjust_kmod_long_name There's a mistake in dso__adjust_kmod_long_name(): it uses strdup() to dup the new long_name of a dso but passes the original string to dso__set_long_name(), which causes random crashes during cleanup. Signed-off-by: Wang Nan Reviewed-by: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Fixes: c03d5184f0e9 ("perf machine: Adjust dso->long_name for offline module") Link: http://lkml.kernel.org/r/1449455785-42020-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 95a7f60..bfc289c 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -576,7 +576,7 @@ static void dso__adjust_kmod_long_name(struct dso *dso, const char *filename) if (!dup_filename) return; - dso__set_long_name(dso, filename, true); + dso__set_long_name(dso, dup_filename, true); } struct map *machine__findnew_module_map(struct machine *machine, u64 start, -- cgit v1.1 From bdaba8aee5c3806d78ee4f130048b2238c636d47 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 09:34:12 +0100 Subject: perf test: Use machine__new_host in dwarf unwind test This is more straightforward than what we have now. It also fixes a segfault within machine__exit that is caused by not creating kernel maps for the machine.
We're calling machine__destroy_kernel_maps in machine__exit since commit: ebe9729c8c31 perf machine: Fix to destroy kernel maps when machine exits Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: David Ahern Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1449131658-1841-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/dwarf-unwind.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 3cce13b1..1c5c022 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -160,14 +160,11 @@ static int krava_1(struct thread *thread) int test__dwarf_unwind(int subtest __maybe_unused) { - struct machines machines; struct machine *machine; struct thread *thread; int err = -1; - machines__init(&machines); - - machine = machines__find(&machines, HOST_KERNEL_ID); + machine = machine__new_host(); if (!machine) { pr_err("Could not get machine\n"); return -1; @@ -199,7 +196,6 @@ int test__dwarf_unwind(int subtest __maybe_unused) out: machine__delete_threads(machine); - machine__exit(machine); - machines__exit(&machines); + machine__delete(machine); return err; } -- cgit v1.1 From 046847935754f27c2e8334ff15abda0b733a1fd4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 09:34:13 +0100 Subject: perf test: Use machine__new_host in mmap thread lookup test This is more straightforward than what we have now. It also fixes a segfault within machine__exit that is caused by not creating kernel maps for the machine. We're calling machine__destroy_kernel_maps in machine__exit since commit: ebe9729c8c31 perf machine: Fix to destroy kernel maps when machine exits Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1449131658-1841-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/mmap-thread-lookup.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c index 6cdb975..0c5ce44 100644 --- a/tools/perf/tests/mmap-thread-lookup.c +++ b/tools/perf/tests/mmap-thread-lookup.c @@ -149,7 +149,6 @@ static int synth_process(struct machine *machine) static int mmap_events(synth_cb synth) { - struct machines machines; struct machine *machine; int err, i; @@ -162,8 +161,7 @@ static int mmap_events(synth_cb synth) */ TEST_ASSERT_VAL("failed to create threads", !threads_create()); - machines__init(&machines); - machine = &machines.host; + machine = machine__new_host(); dump_trace = verbose > 1 ? 1 : 0; @@ -203,7 +201,7 @@ static int mmap_events(synth_cb synth) } machine__delete_threads(machine); - machines__exit(&machines); + machine__delete(machine); return err; } -- cgit v1.1 From 0fd4008ed755c52d85117302a3c2c108b2958420 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 09:34:14 +0100 Subject: perf test: Use machine__new_host in mmap thread code reading test This is more straightforward than what we have now. It also fixes a segfault within machine__exit that is caused by not creating kernel maps for the machine.
We're calling machine__destroy_kernel_maps in machine__exit since commit: ebe9729c8c31 perf machine: Fix to destroy kernel maps when machine exits Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: David Ahern Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1449131658-1841-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 4417b6a..26182ff 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -433,7 +433,6 @@ enum { static int do_test_code_reading(bool try_kcore) { - struct machines machines; struct machine *machine; struct thread *thread; struct record_opts opts = { @@ -459,8 +458,7 @@ static int do_test_code_reading(bool try_kcore) pid = getpid(); - machines__init(&machines); - machine = &machines.host; + machine = machine__new_host(); ret = machine__create_kernel_maps(machine); if (ret < 0) { @@ -594,9 +592,8 @@ out_err: cpu_map__put(cpus); thread_map__put(threads); } - machines__destroy_kernel_maps(&machines); machine__delete_threads(machine); - machines__exit(&machines); + machine__delete(machine); return err; } -- cgit v1.1 From 7320b1b3d9e6af30adcbead64568be3c40b50e59 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 09:34:15 +0100 Subject: perf test: Fix cpus and thread maps reference in error path In the error path that retries with a user space event, both the cpus and threads maps are now owned by the evlist and freed by the perf_evlist__set_maps call. Get a reference to keep them alive. Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: David Ahern Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1449131658-1841-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 26182ff..313a48c 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -547,6 +547,13 @@ static int do_test_code_reading(bool try_kcore) if (ret < 0) { if (!excl_kernel) { excl_kernel = true; + /* + * Both cpus and threads are now owned by evlist + * and will be freed by following perf_evlist__set_maps + * call. Getting reference to keep them alive. + */ + cpu_map__get(cpus); + thread_map__get(threads); perf_evlist__set_maps(evlist, NULL, NULL); perf_evlist__delete(evlist); evlist = NULL; -- cgit v1.1 From c0651c41e45dee1d6abb83fd5b25e7097aeac141 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 09:34:16 +0100 Subject: perf test: Prevent using bpf-output event in round trip name test The bpf-output event is added under software events, but is not parse-able within parse_events, which is what the round trip test expects. Check software events only up to and including the dummy event.
Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1449131658-1841-6-git-send-email-jolsa@kernel.org [ Make it a one liner by keeping __perf_evsel__name_array_test() around ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/evsel-roundtrip-name.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c index 1da92e1..2de4a4f 100644 --- a/tools/perf/tests/evsel-roundtrip-name.c +++ b/tools/perf/tests/evsel-roundtrip-name.c @@ -103,7 +103,8 @@ int test__perf_evsel__roundtrip_name_test(int subtest __maybe_unused) if (err) ret = err; - err = perf_evsel__name_array_test(perf_evsel__sw_names); + err = __perf_evsel__name_array_test(perf_evsel__sw_names, + PERF_COUNT_SW_DUMMY + 1); if (err) ret = err; -- cgit v1.1 From d6e94fa6b6dab4668e46665bbe766142af32cc15 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 2 Dec 2015 22:08:29 +0100 Subject: perf test: Create kernel maps properly for hist entries test It fixes a segfault within machine__exit that is caused by not creating kernel maps for the machine. We're calling machine__destroy_kernel_maps in machine__exit since commit: ebe9729c8c31 perf machine: Fix to destroy kernel maps when machine exits Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/tip-k4snzv5t4dvdckggzwdzyljo@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/hists_common.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index ce80b27..46f453b 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -87,6 +87,11 @@ struct machine *setup_fake_machine(struct machines *machines) return NULL; } + if (machine__create_kernel_maps(machine)) { + pr_debug("Not enough memory for machine setup\n"); + goto out; + } + for (i = 0; i < ARRAY_SIZE(fake_threads); i++) { struct thread *thread; -- cgit v1.1 From 5cd95fc3f8d84a8bb256838fa3b6b59e9095eaa2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 10:06:40 +0100 Subject: perf evsel: Use event maps directly in perf_evsel__enable All events now share proper cpu and thread maps. There's no need to pass those maps in from the evlist; it's safe to use the evsel's own maps when enabling an event.
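For callers this means the map bookkeeping disappears entirely; a sketch of the simplified call site, assuming the perf-internal evlist/evsel API of this series (the delay handling mirrors the stat code):

/*
 * The evsel's own cpu/thread maps now size the ioctl fan-out,
 * so enabling after the initial delay needs no counts.
 */
static void enable_after_delay(struct perf_evlist *evsel_list,
                               unsigned int delay_ms)
{
        struct perf_evsel *counter;

        usleep(delay_ms * 1000);
        evlist__for_each(evsel_list, counter)
                perf_evsel__enable(counter);
}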
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449133606-14429-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 +---- tools/perf/util/evsel.c | 5 ++++- tools/perf/util/evsel.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index df2fbf0..813c52a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -256,12 +256,9 @@ static void handle_initial_delay(void) struct perf_evsel *counter; if (initial_delay) { - const int ncpus = cpu_map__nr(evsel_list->cpus), - nthreads = thread_map__nr(evsel_list->threads); - usleep(initial_delay * 1000); evlist__for_each(evsel_list, counter) - perf_evsel__enable(counter, ncpus, nthreads); + perf_evsel__enable(counter); } } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0a1f4d9..3a9b506 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -981,8 +981,11 @@ int perf_evsel__append_filter(struct perf_evsel *evsel, return -1; } -int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads) +int perf_evsel__enable(struct perf_evsel *evsel) { + int nthreads = thread_map__nr(evsel->threads); + int ncpus = cpu_map__nr(evsel->cpus); + return perf_evsel__run_ioctl(evsel, ncpus, nthreads, PERF_EVENT_IOC_ENABLE, 0); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 0e49bd7..a721592 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -227,7 +227,7 @@ int perf_evsel__append_filter(struct perf_evsel *evsel, const char *op, const char *filter); int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, const char *filter); -int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads); +int perf_evsel__enable(struct perf_evsel *evsel); int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus); -- cgit v1.1 From e98a4cbb01e0ba1110eba5166a425b3eab9b2244 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 10:06:41 +0100 Subject: perf evsel: Introduce disable() method Add a perf_evsel__disable function as the complement of perf_evsel__enable. Both will be used in a following patch to factor perf_evlist__(enable|disable).
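Together the pair can gate a counter around just the region of interest; a usage sketch (run_workload is a placeholder and the evsel is assumed to be already opened):

/* Count only the measured region. */
static int measure_region(struct perf_evsel *evsel)
{
        int err = perf_evsel__enable(evsel);

        if (err)
                return err;
        run_workload();         /* placeholder for the traced work */
        return perf_evsel__disable(evsel);
}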
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449133606-14429-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 10 ++++++++++ tools/perf/util/evsel.h | 1 + 2 files changed, 11 insertions(+) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 3a9b506..47f0330 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -991,6 +991,16 @@ int perf_evsel__enable(struct perf_evsel *evsel) 0); } +int perf_evsel__disable(struct perf_evsel *evsel) +{ + int nthreads = thread_map__nr(evsel->threads); + int ncpus = cpu_map__nr(evsel->cpus); + + return perf_evsel__run_ioctl(evsel, ncpus, nthreads, + PERF_EVENT_IOC_DISABLE, + 0); +} + int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads) { if (ncpus == 0 || nthreads == 0) diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index a721592..5ded1fc 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -228,6 +228,7 @@ int perf_evsel__append_filter(struct perf_evsel *evsel, int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads, const char *filter); int perf_evsel__enable(struct perf_evsel *evsel); +int perf_evsel__disable(struct perf_evsel *evsel); int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus); -- cgit v1.1 From 3e27c92081131738fa4d7dd71673aa6e8c24866d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 10:06:42 +0100 Subject: perf evlist: Factor perf_evlist__(enable|disable) functions Use perf_evsel__(enable|disable) functions in perf_evlist__(enable|disable) functions in order to centralize ioctl enable/disable calls. This way we eliminate two places that call ioctl directly.
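With the ioctl handling centralized, pausing and resuming a whole session reduces to the two list-level calls; a sketch (the reconfigure step is a hypothetical helper):

/*
 * Pause all counters while reconfiguring, then resume. Only group
 * leaders are toggled; their members follow the leader.
 */
static void pause_and_resume(struct perf_evlist *evlist)
{
        perf_evlist__disable(evlist);
        reconfigure_filters(evlist);    /* hypothetical helper */
        perf_evlist__enable(evlist);
}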
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449133606-14429-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d139219..d1b6c20 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -336,20 +336,12 @@ static int perf_evlist__nr_threads(struct perf_evlist *evlist, void perf_evlist__disable(struct perf_evlist *evlist) { - int cpu, thread; struct perf_evsel *pos; - int nr_cpus = cpu_map__nr(evlist->cpus); - int nr_threads; - for (cpu = 0; cpu < nr_cpus; cpu++) { - evlist__for_each(evlist, pos) { - if (!perf_evsel__is_group_leader(pos) || !pos->fd) - continue; - nr_threads = perf_evlist__nr_threads(evlist, pos); - for (thread = 0; thread < nr_threads; thread++) - ioctl(FD(pos, cpu, thread), - PERF_EVENT_IOC_DISABLE, 0); - } + evlist__for_each(evlist, pos) { + if (!perf_evsel__is_group_leader(pos) || !pos->fd) + continue; + perf_evsel__disable(pos); } evlist->enabled = false; @@ -357,20 +349,12 @@ void perf_evlist__disable(struct perf_evlist *evlist) void perf_evlist__enable(struct perf_evlist *evlist) { - int cpu, thread; struct perf_evsel *pos; - int nr_cpus = cpu_map__nr(evlist->cpus); - int nr_threads; - for (cpu = 0; cpu < nr_cpus; cpu++) { - evlist__for_each(evlist, pos) { - if (!perf_evsel__is_group_leader(pos) || !pos->fd) - continue; - nr_threads = perf_evlist__nr_threads(evlist, pos); - for (thread = 0; thread < nr_threads; thread++) - ioctl(FD(pos, cpu, thread), - PERF_EVENT_IOC_ENABLE, 0); - } + evlist__for_each(evlist, pos) { + if (!perf_evsel__is_group_leader(pos) || !pos->fd) + continue; + perf_evsel__enable(pos); } evlist->enabled = true; -- cgit v1.1 From ab46db0a3325a064bb24e826b12995d157565efb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 10:06:43 +0100 Subject: perf stat: Use perf_evlist__enable in handle_initial_delay No need to mimic the behaviour of perf_evlist__enable, we can use it directly. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449133606-14429-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 813c52a..8ca40de 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -253,12 +253,9 @@ static void process_interval(void) static void handle_initial_delay(void) { - struct perf_evsel *counter; if (initial_delay) { usleep(initial_delay * 1000); - evlist__for_each(evsel_list, counter) - perf_evsel__enable(counter); + perf_evlist__enable(evsel_list); } } -- cgit v1.1 From 67ccdecd09cac818146b1e153ff901cb67570012 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 10:06:44 +0100 Subject: perf stat: Create events as disabled Currently we have 2 kinds of stat counters based on when the event is enabled: 1) tracee command events, which are enabled once the tracee executes the exec syscall (enable_on_exec bit) 2) all other events, which become alive within the perf_event_open syscall The 2) case can be a problem when we want an additional filter to be attached to an event.
In this case we want the event to be enabled after it is configured with the filter. Change the behaviour of the 2) events so that they are all created disabled (disabled bit), and add an extra enable call to bring them alive once their setup is finished. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449133606-14429-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8ca40de..2e70610 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -168,11 +168,18 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) attr->sample_period = 0; attr->sample_type = 0; + /* + * Disabling all counters initially, they will be enabled + * either manually by us or by kernel via enable_on_exec + * set later. + */ + if (perf_evsel__is_group_leader(evsel)) + attr->disabled = 1; + if (target__has_cpu(&target)) return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) { - attr->disabled = 1; if (!initial_delay) attr->enable_on_exec = 1; } @@ -251,12 +258,18 @@ static void process_interval(void) print_counters(&rs, 0, NULL); } -static void handle_initial_delay(void) +static void enable_counters(void) { - if (initial_delay) { + if (initial_delay) usleep(initial_delay * 1000); + + /* + * We need to enable counters only if: + * - we don't have tracee (attaching to task or cpu) + * - we have initial delay configured + */ + if (!target__none(&target) || initial_delay) perf_evlist__enable(evsel_list); - } } static volatile int workload_exec_errno; @@ -353,7 +366,7 @@ static int __run_perf_stat(int argc, const char **argv) if (forks) { perf_evlist__start_workload(evsel_list); - handle_initial_delay(); + enable_counters(); if (interval) { while (!waitpid(child_pid, &status, WNOHANG)) { @@ -372,7 +385,7 @@ static int __run_perf_stat(int argc, const char **argv) if (WIFSIGNALED(status)) psignal(WTERMSIG(status), argv[0]); } else { - handle_initial_delay(); + enable_counters(); while (!done) { nanosleep(&ts, NULL); if (interval) -- cgit v1.1 From c8280cec2a196f2ffea83dd755b17eb020ca1b83 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Dec 2015 10:06:45 +0100 Subject: perf stat: Move enable_on_exec setup under earlier code It's more readable this way and we can save one perf_evsel__is_group_leader condition in the current code. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449133606-14429-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2e70610..e74712d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -173,17 +173,20 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) * either manually by us or by kernel via enable_on_exec * set later.
*/ - if (perf_evsel__is_group_leader(evsel)) + if (perf_evsel__is_group_leader(evsel)) { attr->disabled = 1; - if (target__has_cpu(&target)) - return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); - - if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) { - if (!initial_delay) + /* + * In case of initial_delay we enable tracee + * events manually. + */ + if (target__none(&target) && !initial_delay) attr->enable_on_exec = 1; } + if (target__has_cpu(&target)) + return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)); + return perf_evsel__open_per_thread(evsel, evsel_list->threads); } -- cgit v1.1 From cfef25b8daf7e4b49c84e174a904af9d89dc7c46 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 6 Dec 2015 23:07:13 +0000 Subject: perf annotate: ARM support Add basic support to parse ARM assembly. This: * enables perf to correctly show the disassembly, rather than chopping some constants off at the '#' (which is not a comment character on ARM). * allows perf to identify ARM instructions that branch to other parts within the same function, thereby properly annotating them. * allows perf to identify function calls, allowing called functions to be followed in the annotated view. Signed-off-by: Russell King Cc: Peter Zijlstra Cc: Will Deacon Link: http://lkml.kernel.org/n/tip-owp1uj0nmcgfrlppfyeetuyf@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 1dd1949..b795b69 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -65,6 +65,11 @@ static int call__parse(struct ins_operands *ops) name++; +#ifdef __arm__ + if (strchr(name, '+')) + return -1; +#endif + tok = strchr(name, '>'); if (tok == NULL) return -1; @@ -246,7 +251,11 @@ static int mov__parse(struct ins_operands *ops) return -1; target = ++s; +#ifdef __arm__ + comment = strchr(s, ';'); +#else comment = strchr(s, '#'); +#endif if (comment != NULL) s = comment - 1; @@ -354,6 +363,20 @@ static struct ins instructions[] = { { .name = "addq", .ops = &mov_ops, }, { .name = "addw", .ops = &mov_ops, }, { .name = "and", .ops = &mov_ops, }, +#ifdef __arm__ + { .name = "b", .ops = &jump_ops, }, // might also be a call + { .name = "bcc", .ops = &jump_ops, }, + { .name = "bcs", .ops = &jump_ops, }, + { .name = "beq", .ops = &jump_ops, }, + { .name = "bge", .ops = &jump_ops, }, + { .name = "bgt", .ops = &jump_ops, }, + { .name = "bhi", .ops = &jump_ops, }, + { .name = "bl", .ops = &call_ops, }, + { .name = "blt", .ops = &jump_ops, }, + { .name = "bls", .ops = &jump_ops, }, + { .name = "blx", .ops = &call_ops, }, + { .name = "bne", .ops = &jump_ops, }, +#endif { .name = "bts", .ops = &mov_ops, }, { .name = "call", .ops = &call_ops, }, { .name = "callq", .ops = &call_ops, }, -- cgit v1.1 From 79cfea0273876d9c438f3227b8f68c8c7ae31583 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Dec 2015 13:09:52 -0800 Subject: rcu: Remove TINY_RCU bloat from pointless boot parameters The rcu_expedited, rcu_normal, and rcu_normal_after_boot kernel boot parameters are pointless in the case of TINY_RCU because in that case synchronous grace periods, both expedited and normal, are no-ops. However, these three symbols contribute several hundred bytes of bloat. This commit therefore uses CPP directives to avoid compiling this code in TINY_RCU kernels. Reported-by: kbuild test robot Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- Documentation/kernel-parameters.txt | 13 ++++++++----- include/linux/rcupdate.h | 9 ++++++++- kernel/ksysfs.c | 4 ++++ kernel/rcu/update.c | 7 ++++--- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 197305bb..d8186da 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3308,20 +3308,23 @@ bytes respectively. Such letter suffixes can also be entirely omitted. of synchronize_rcu(). This reduces latency, but can increase CPU utilization, degrade real-time latency, and degrade energy efficiency. + No effect on CONFIG_TINY_RCU kernels. rcupdate.rcu_normal= [KNL] Use only normal grace-period primitives, for example, synchronize_rcu() instead of synchronize_rcu_expedited(). This improves - real-time latency, CPU utilization, and energy - efficiency, but can expose users to increased - grace-period latency. This parameter overrides - rcupdate.rcu_expedited. + real-time latency, CPU utilization, and + energy efficiency, but can expose users to + increased grace-period latency. This parameter + overrides rcupdate.rcu_expedited. No effect on + CONFIG_TINY_RCU kernels. rcupdate.rcu_normal_after_boot= [KNL] Once boot has completed (that is, after rcu_end_inkernel_boot() has been invoked), use - only normal grace-period primitives. + only normal grace-period primitives. No effect + on CONFIG_TINY_RCU kernels. rcupdate.rcu_task_stall_timeout= [KNL] Set timeout in jiffies for RCU task stall warning diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 98d9f30c0..47e95b8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -48,8 +48,10 @@ #include +#ifndef CONFIG_TINY_RCU extern int rcu_expedited; /* for sysctl */ extern int rcu_normal; /* also for sysctl */ +#endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ @@ -327,7 +329,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); -void rcu_end_inkernel_boot(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); @@ -335,6 +336,12 @@ struct notifier_block; int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); +#ifndef CONFIG_TINY_RCU +void rcu_end_inkernel_boot(void); +#else /* #ifndef CONFIG_TINY_RCU */ +static inline void rcu_end_inkernel_boot(void) { } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b4e2fa5..152da4a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -144,6 +144,7 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,6 +178,7 @@ static ssize_t rcu_normal_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
@@ -219,8 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 12b91f5..76b94e1 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,11 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); - static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -172,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -185,6 +184,8 @@ void rcu_end_inkernel_boot(void) WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* -- cgit v1.1 From a87f203e2731ab477386c678e59033ee103018c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Oct 2015 12:38:49 -0700 Subject: rcu: Eliminate unused rcu_init_one() argument Now that the rcu_state structure's ->rda field is compile-time initialized, there is no need to pass the per-CPU rcu_data structure into rcu_init_one(). This commit therefore eliminates this now-unused parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93941d3..9a4c8c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4351,8 +4351,7 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; @@ -4545,8 +4544,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6da888..fccef5d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -777,7 +777,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* -- cgit v1.1 From d117c8aa1d511f76401337620b9c4ffb4c886579 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Oct 2015 00:01:18 -0700 Subject: rcu: Make cpu_needs_another_gp() be bool The cpu_needs_another_gp() function is currently of type int, but only returns zero or one. Bow to reality and make it be of type bool. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a4c8c0..d6863bc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -597,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* -- cgit v1.1 From 7c9906ca5e582a773fff696975e312cef58a7386 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Oct 2015 00:59:01 -0700 Subject: rcu: Don't redundantly disable irqs in rcu_irq_{enter,exit}() This commit replaces a local_irq_save()/local_irq_restore() pair with a lockdep assertion that interrupts are already disabled. This should remove the corresponding overhead from the interrupt entry/exit fastpaths. This change was inspired by the fact that Iftekhar Ahmed's mutation testing showed that removing rcu_irq_enter()'s call to local_irq_restore() had no effect, which might indicate that interrupts were always enabled anyway. Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutiny.h | 8 ++++++++ include/linux/rcutree.h | 2 ++ include/linux/tracepoint.h | 4 ++-- kernel/rcu/tree.c | 32 ++++++++++++++++++++++++++------ 5 files changed, 40 insertions(+), 10 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a0189ba..f2b667d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -379,9 +379,9 @@ static inline void rcu_init_nohz(void) */ #define RCU_NONIDLE(a) \ do { \ - rcu_irq_enter(); \ + rcu_irq_enter_irqson(); \ do { a; } while (0); \ - rcu_irq_exit(); \ + rcu_irq_exit_irqson(); \ } while (0) /* diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 4c1aaf9..64809ae 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -181,6 +181,14 @@ static inline void rcu_irq_enter(void) { } +static inline void rcu_irq_exit_irqson(void) +{ +} + +static inline void rcu_irq_enter_irqson(void) +{ +} + static inline void rcu_irq_exit(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9d3eda3..ad1eda9 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,6 +97,8 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +void rcu_irq_enter_irqson(void); +void rcu_irq_exit_irqson(void); void exit_rcu(void); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 696a339c..7834a8a 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -171,8 +171,8 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond), \ - rcu_irq_enter(), \ - rcu_irq_exit()); \ + rcu_irq_enter_irqson(), \ + rcu_irq_exit_irqson()); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d6863bc..40940b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -732,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -745,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -760,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -857,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. 
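In caller terms, the reworked contract looks like this (a sketch that uses only the functions this patch itself provides):

	unsigned long flags;

	local_irq_save(flags);
	rcu_irq_exit();		/* lockdep now asserts irqs are disabled */
	local_irq_restore(flags);

	/* ...or, from a context where irqs are known to be enabled: */
	rcu_irq_exit_irqson();

The same pairing holds for rcu_irq_enter() and rcu_irq_enter_irqson() on the entry side.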
* * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -873,11 +883,10 @@ void rcu_user_exit(void) void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -888,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } -- cgit v1.1 From f039f0af081746933d5dec3229637a18fab791ed Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 2 Nov 2015 13:21:47 +1100 Subject: rcu: Fix comment for rcu_dereference_raw_notrace rcu_dereference_raw() indirectly calls rcu_read_lock_held(), while rcu_dereference_raw_notrace() does not, so fix the comment about the latter. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f2b667d..85aabcd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -741,7 +741,7 @@ static inline void rcu_preempt_sleep_check(void) * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * - * The tracing version of rcu_dereference_raw() must not call + * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) -- cgit v1.1 From e11f13355b09df970495c45ed0eac1dc85dcf5c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Nov 2015 08:22:05 -0800 Subject: rcu: Move wakeup out from under rnp->lock This patch removes a potential deadlock hazard by moving the wake_up_process() in rcu_spawn_gp_kthread() out from under rnp->lock. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40940b0..87b604d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4319,8 +4319,8 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); -- cgit v1.1 From 45fed3e7cfb4001c80cd4bd25249d194a52bfed3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Nov 2015 23:35:00 -0800 Subject: rcu: Make rcu_gp_init() be bool rather than int The return value from rcu_gp_init() is always used as a bool, so this commit makes it be a bool. Reported-by: Iftekhar Ahmed Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 87b604d..01a90a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1814,9 +1814,9 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. Return 0 if no grace period required.
+ * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; @@ -1827,7 +1827,7 @@ static int rcu_gp_init(struct rcu_state *rsp) if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1837,7 +1837,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * Not supposed to be able to happen. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1929,7 +1929,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* -- cgit v1.1 From 69b907297f4edf13182e3fa3adc0160df077746c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Sat, 5 Dec 2015 18:14:19 -0800 Subject: list: Add lockless list traversal primitives Although list_for_each_entry_rcu() can in theory be used anywhere preemption is disabled, it can result in calls to lockdep, which cannot be used in certain constrained execution environments, such as exception handlers that do not map the entire kernel into their address spaces. This commit therefore adds list_entry_lockless() and list_for_each_entry_lockless(), which never invoke lockdep and can therefore safely be used from these constrained environments, but only as long as those environments are non-preemptible (or items are never deleted from the list). Use synchronize_sched(), call_rcu_sched(), or synchronize_sched_expedited() in updates for the needed grace periods. Of course, if items are never deleted from the list, there is no need to wait for grace periods. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 5ed5409..1fad798 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -305,6 +305,42 @@ static inline void list_splice_init_rcu(struct list_head *list, pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** + * list_entry_lockless - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu(), but requires some implicit RCU + * read-side guarding. One example is running within a special + * exception-time environment where preemption is disabled and where + * lockdep cannot be invoked (in which case updaters must use RCU-sched, + * as in synchronize_sched(), call_rcu_sched(), and friends). Another + * example is when items are added to the list, but never deleted. + */ +#define list_entry_lockless(ptr, type, member) \ + container_of((typeof(ptr))lockless_dereference(ptr), type, member) + +/** + * list_for_each_entry_lockless - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu(), but requires some implicit RCU + * read-side guarding. One example is running within a special + * exception-time environment where preemption is disabled and where + * lockdep cannot be invoked (in which case updaters must use RCU-sched, + * as in synchronize_sched(), call_rcu_sched(), and friends). Another + * example is when items are added to the list, but never deleted. + */ +#define list_for_each_entry_lockless(pos, head, member) \ + for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) + +/** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. -- cgit v1.1 From 21fc61c73c3903c4c312d0802da01ec2b323d174 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Nov 2015 01:07:57 -0500 Subject: don't put symlink bodies in pagecache into highmem kmap() in page_follow_link_light() needed to go - holding an arbitrary number of kmaps for a long time is a great way to deadlock the system. new helper (inode_nohighmem(inode)) needs to be used for pagecache symlink inodes; done for all in-tree cases. page_follow_link_light() instrumented to yell about anything missed. Signed-off-by: Al Viro --- Documentation/filesystems/porting | 5 +++++ fs/affs/inode.c | 1 + fs/affs/namei.c | 1 + fs/affs/symlink.c | 4 +--- fs/afs/inode.c | 1 + fs/befs/linuxvfs.c | 5 ++--- fs/btrfs/inode.c | 2 ++ fs/coda/cnode.c | 2 ++ fs/coda/symlink.c | 4 +--- fs/cramfs/inode.c | 1 + fs/efs/inode.c | 1 + fs/efs/symlink.c | 4 +--- fs/exofs/inode.c | 1 + fs/exofs/namei.c | 1 + fs/ext2/inode.c | 1 + fs/ext2/namei.c | 1 + fs/ext4/inode.c | 1 + fs/ext4/namei.c | 1 + fs/ext4/symlink.c | 10 +++------- fs/f2fs/inode.c | 1 + fs/f2fs/namei.c | 5 ++--- fs/freevxfs/vxfs_inode.c | 1 + fs/hfsplus/inode.c | 2 ++ fs/hpfs/inode.c | 1 + fs/hpfs/namei.c | 5 ++--- fs/hugetlbfs/inode.c | 1 + fs/inode.c | 6 ++++++ fs/isofs/inode.c | 1 + fs/isofs/rock.c | 4 +--- fs/jfs/inode.c | 1 + fs/jfs/namei.c | 1 + fs/logfs/dir.c | 1 + fs/logfs/inode.c | 1 + fs/minix/inode.c | 1 + fs/namei.c | 9 +++------ fs/ncpfs/inode.c | 1 + fs/nfs/inode.c | 5 +++-- fs/nfs/symlink.c | 2 +- fs/nilfs2/inode.c | 1 + fs/nilfs2/namei.c | 1 + fs/ocfs2/inode.c | 1 + fs/ocfs2/namei.c | 1 + fs/qnx4/inode.c | 1 + fs/qnx6/inode.c | 1 + fs/ramfs/inode.c | 1 + fs/reiserfs/inode.c | 1 + fs/reiserfs/namei.c | 1 + fs/romfs/super.c | 1 + fs/squashfs/inode.c | 2 ++ fs/sysv/inode.c | 1 + fs/udf/inode.c | 1 + fs/udf/namei.c | 1 + fs/udf/symlink.c | 4 +--- fs/ufs/inode.c | 1 + fs/ufs/namei.c | 1 + include/linux/fs.h | 1 + mm/shmem.c | 9 +++------ 57 files changed, 81 insertions(+), 46 deletions(-) diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index f24d1b8..3eb7c35 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -504,3 +504,8 @@ in your dentry operations instead. [mandatory] __fd_install() & fd_install() can now sleep. Callers should not hold a spinlock or other resources that do not allow a schedule. +-- +[mandatory] + any symlink that might use page_follow_link_light/page_put_link() must + have inode_nohighmem(inode) called before anything might start playing with + its pagecache.
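Returning to the list_for_each_entry_lockless() primitive added above, a usage sketch (struct foo, foo_head, foo_sum() and foo_del() are all made up for illustration):

	struct foo {
		int val;
		struct list_head list;
	};

	static LIST_HEAD(foo_head);

	/* Reader, e.g. in a non-preemptible exception-time environment: */
	static int foo_sum(void)
	{
		struct foo *pos;
		int sum = 0;

		list_for_each_entry_lockless(pos, &foo_head, list)
			sum += pos->val;
		return sum;
	}

	/* An updater that deletes an entry must pair with RCU-sched: */
	static void foo_del(struct foo *victim)
	{
		list_del_rcu(&victim->list);
		synchronize_sched();
		kfree(victim);
	}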
diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 1734950..0fdb0f5 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; case ST_SOFTLINK: inode->i_mode |= S_IFLNK; + inode_nohighmem(inode); inode->i_op = &affs_symlink_inode_operations; inode->i_data.a_ops = &affs_symlink_aops; break; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 181e05b..00d3002 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return -ENOSPC; inode->i_op = &affs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &affs_symlink_aops; inode->i_mode = S_IFLNK | 0777; mode_to_prot(inode); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index ea5b69a..e3f9dc3 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -14,7 +14,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; struct inode *inode = page->mapping->host; - char *link = kmap(page); + char *link = page_address(page); struct slink_front *lf; int i, j; char c; @@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page) link[i] = '\0'; affs_brelse(bh); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e06f5a2..86cc726 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) case AFS_FTYPE_SYMLINK: inode->i_mode = S_IFLNK | vnode->status.mode; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; default: printk("kAFS: AFS vnode with undefined type\n"); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 1c8b0dc..25250fa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -397,6 +397,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &befs_symlink_aops; } else { inode->i_link = befs_ino->i_data.symlink; @@ -469,7 +470,7 @@ static int befs_symlink_readpage(struct file *unused, struct page *page) struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link = kmap(page); + char *link = page_address(page); if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); @@ -483,12 +484,10 @@ static int befs_symlink_readpage(struct file *unused, struct page *page) } link[len - 1] = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a70c579..70f98bf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3774,6 +3774,7 @@ cache_acl: break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; break; default: @@ -9705,6 +9706,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_free_path(path); inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); 
btrfs_i_size_write(inode, name_len); diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 7740b1c..dd6a79e 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -8,6 +8,7 @@ #include #include +#include #include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) @@ -35,6 +36,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) inode->i_fop = &coda_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &coda_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &coda_symlink_aops; inode->i_mapping = &inode->i_data; } else diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index ab94ef6..03736e2 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; - char *p = kmap(page); + char *p = page_address(page); cii = ITOC(inode); @@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page) if (error) goto fail; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return error; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 355c522..b862bc2 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; default: diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 079d203..cdf0872 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &efs_symlink_aops; break; case S_IFCHR: diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 75117d0..4870cc8 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -13,7 +13,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) { - char *link = kmap(page); + char *link = page_address(page); struct buffer_head * bh; struct inode * inode = page->mapping->host; efs_block_t size = inode->i_size; @@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page) } link[size] = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return err; } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 73c64da..d8e9c181 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1227,6 +1227,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_link = (char *)oi->i_data; } else { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &exofs_aops; } } else { diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 994e078..c20d77d 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -111,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, if (l > sizeof(oi->i_data)) { /* slow symlink */ inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &exofs_aops; memset(oi->i_data, 0, sizeof(oi->i_data)); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0aa9bf6..338eefd 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1420,6 +1420,7 @@ struct inode *ext2_iget 
(struct super_block *sb, unsigned long ino) sizeof(ei->i_data) - 1); } else { inode->i_op = &ext2_symlink_inode_operations; + inode_nohighmem(inode); if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; else diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 3267a80..7a2be8f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -183,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sizeof (EXT2_I(inode)->i_data)) { /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; + inode_nohighmem(inode); if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; else diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea433a7..b3bd912 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4283,6 +4283,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } + inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a969ab3..f27e0c2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3132,6 +3132,7 @@ static int ext4_symlink(struct inode *dir, if ((disk_link.len > EXT4_N_BLOCKS * 4)) { if (!encryption_required) inode->i_op = &ext4_symlink_inode_operations; + inode_nohighmem(inode); ext4_set_aops(inode); /* * We cannot call page_symlink() with transaction started diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index abe2401..0e6dc44 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -45,7 +45,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); + caddr = page_address(cpage); caddr[size] = 0; } @@ -75,16 +75,12 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; - if (cpage) { - kunmap(cpage); + if (cpage) page_cache_release(cpage); - } return *cookie = paddr; errout: - if (cpage) { - kunmap(cpage); + if (cpage) page_cache_release(cpage); - } kfree(paddr); return ERR_PTR(res); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20de..5528801 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -202,6 +202,7 @@ make_now: inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110..484df68 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -351,6 +351,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; f2fs_lock_op(sbi); @@ -942,7 +943,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); + caddr = page_address(cpage); caddr[size] = 0; /* Symlink is encrypted */ @@ -982,13 +983,11 @@ static const char *f2fs_encrypted_follow_link(struct dentry 
*dentry, void **cook /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); page_cache_release(cpage); return *cookie = paddr; errout: kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); page_cache_release(cpage); return ERR_PTR(res); } diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ef73ed6..3e2ccad 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) } else if (S_ISLNK(ip->i_mode)) { if (!VXFS_ISIMMED(vip)) { ip->i_op = &page_symlink_inode_operations; + inode_nohighmem(ip); ip->i_mapping->a_ops = &vxfs_aops; } else { ip->i_op = &simple_symlink_inode_operations; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 6dd107d..19b33f8 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) } else if (S_ISLNK(inode->i_mode)) { sbi->file_count++; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = 1; } else @@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_mapping->a_ops = &hfsplus_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; } else { init_special_inode(inode, inode->i_mode, diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 933c737..1f3c6d7 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i) kfree(ea); i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; + inode_nohighmem(i); i->i_data.a_ops = &hpfs_symlink_aops; set_nlink(i, 1); i->i_size = ea_size; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index ae4d5a1..506765a 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -332,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_blocks = 1; set_nlink(result, 1); result->i_size = strlen(symlink); + inode_nohighmem(result); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; @@ -500,7 +501,7 @@ out: static int hpfs_symlink_readpage(struct file *file, struct page *page) { - char *link = kmap(page); + char *link = page_address(page); struct inode *i = page->mapping->host; struct fnode *fnode; struct buffer_head *bh; @@ -516,14 +517,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page) goto fail; hpfs_unlock(i->i_sb); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: hpfs_unlock(i->i_sb); SetPageError(page); - kunmap(page); unlock_page(page); return err; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index de4bdfa..d8f51ee 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -760,6 +760,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); diff --git a/fs/inode.c b/fs/inode.c index 1be5f90..5bb85a0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2028,3 +2028,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags, new_flags) != old_flags)); } EXPORT_SYMBOL(inode_set_flags); + +void inode_nohighmem(struct inode *inode) +{ + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); +} +EXPORT_SYMBOL(inode_nohighmem); diff --git a/fs/isofs/inode.c 
b/fs/isofs/inode.c index d67a16f..61abdc4 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_fop = &isofs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; } else /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 735d752..5384ceb 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct iso_inode_info *ei = ISOFS_I(inode); struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); - char *link = kmap(page); + char *link = page_address(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); struct buffer_head *bh; char *rpnt = link; @@ -774,7 +774,6 @@ repeat: brelse(bh); *rpnt = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; @@ -791,7 +790,6 @@ fail: brelse(bh); error: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 41aa3ca..9d9bae6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { if (inode->i_size >= IDATASIZE) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &jfs_aops; } else { inode->i_op = &jfs_fast_symlink_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9d7551f..701f893 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -983,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); ip->i_op = &jfs_symlink_inode_operations; + inode_nohighmem(ip); ip->i_mapping->a_ops = &jfs_aops; /* diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 99944a4..542468e 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -529,6 +529,7 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return PTR_ERR(inode); inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &logfs_reg_aops; return __logfs_create(dir, dentry, inode, target, destlen); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 06baa92..0fce46d 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -65,6 +65,7 @@ static void logfs_inode_setops(struct inode *inode) break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFSOCK: /* fall through */ diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 086cd0a..67a23bf 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -452,6 +452,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_mapping->a_ops = &minix_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; } else init_special_inode(inode, inode->i_mode, rdev); diff --git a/fs/namei.c b/fs/namei.c index 4bae5cb..2808958 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4527,7 +4527,8 @@ static const char *page_getlink(struct dentry * dentry, void **cookie) if (IS_ERR(page)) return (char*)page; *cookie = page; - kaddr = kmap(page); + BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); + kaddr = page_address(page); 
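/*
 * page_address() suffices here only because inode_nohighmem() pinned
 * this mapping's allocations to GFP_USER, so symlink pagecache pages
 * can never come from highmem; the BUG_ON() above catches any
 * filesystem that forgot to call inode_nohighmem() first.
 */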
nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); return kaddr; } @@ -4541,7 +4542,6 @@ EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct inode *unused, void *cookie) { struct page *page = cookie; - kunmap(page); page_cache_release(page); } EXPORT_SYMBOL(page_put_link); @@ -4565,7 +4565,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - char *kaddr; unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; if (nofs) flags |= AOP_FLAG_NOFS; @@ -4576,9 +4575,7 @@ retry: if (err) goto fail; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len-1); - kunmap_atomic(kaddr); + memcpy(page_address(page), symname, len-1); err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 9605a2f..bb856f7 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -283,6 +283,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &ncp_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &ncp_symlink_aops; #endif } else { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 31b0a52..ae9aa0b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -408,9 +408,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_fop = NULL; inode->i_flags |= S_AUTOMOUNT; } - } else if (S_ISLNK(inode->i_mode)) + } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nfs_symlink_inode_operations; - else + inode_nohighmem(inode); + } else init_special_inode(inode, inode->i_mode, fattr->rdev); memset(&inode->i_atime, 0, sizeof(inode->i_atime)); diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index b6de433..abd93bf 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -56,7 +56,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie) if (IS_ERR(page)) return ERR_CAST(page); *cookie = page; - return kmap(page); + return page_address(page); } /* diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index ac2f649..10b2252 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb, inode->i_mapping->a_ops = &nilfs_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nilfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; } else { inode->i_op = &nilfs_special_inode_operations; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c9a1a49..90b3ba9 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -161,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8f87e05..97a563b 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -361,6 +361,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a03f6f4..2efe8af 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1960,6 +1960,7 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; 
inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); if (l > ocfs2_fast_symlink_chars(sb)) { u32 offset = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index c4bcb77..f37b3de 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &qnx4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx4_aops; qnx4_i(inode)->mmu_private = inode->i_size; } else { diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 32d2e1a..9728b54 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 889d558..38981b0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3d8e7e6..ae9e5b3 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_fop = &reiserfs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &reiserfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &reiserfs_address_space_operations; } else { inode->i_blocks = 0; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 47f9698..4fc2326 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1170,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir, reiserfs_update_inode_transaction(parent_dir); inode->i_op = &reiserfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &reiserfs_address_space_operations; retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 268733c..bb894e7 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) break; case ROMFH_SYM: i->i_op = &page_symlink_inode_operations; + inode_nohighmem(i); i->i_data.a_ops = &romfs_aops; mode |= S_IRWXUGO; break; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index a1ce5ce..0927b1e 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; + inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 02fa1dc..ef8bcdb 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -163,6 +163,7 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) inode->i_mapping->a_ops = &sysv_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &sysv_symlink_inode_operations; + 
inode_nohighmem(inode); inode->i_mapping->a_ops = &sysv_aops; } else init_special_inode(inode, inode->i_mode, rdev); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 8675c2b..0557463 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1541,6 +1541,7 @@ reread: case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; break; case ICBTAG_FILE_TYPE_MAIN: diff --git a/fs/udf/namei.c b/fs/udf/namei.c index d0e6de1..42eafb9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -922,6 +922,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { struct kernel_lb_addr eloc; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 862535b..8d61977 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page) struct buffer_head *bh = NULL; unsigned char *symlink; int err; - unsigned char *p = kmap(page); + unsigned char *p = page_address(page); struct udf_inode_info *iinfo; uint32_t pos; @@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page) up_read(&iinfo->i_data_sem); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; @@ -149,7 +148,6 @@ out_unlock_inode: up_read(&iinfo->i_data_sem); SetPageError(page); out_unmap: - kunmap(page); unlock_page(page); return err; } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 737160a..d897e16 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -533,6 +533,7 @@ static void ufs_set_inode_ops(struct inode *inode) } else { inode->i_mapping->a_ops = &ufs_aops; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); } } else init_special_inode(inode, inode->i_mode, diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 24b0cbd..acf4a3b 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -124,6 +124,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &ufs_aops; err = page_symlink(inode, symname, l); if (err) diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa5142..dfeda44 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3025,5 +3025,6 @@ static inline bool dir_relax(struct inode *inode) } extern bool path_noexec(const struct path *path); +extern void inode_nohighmem(struct inode *inode); #endif /* _LINUX_FS_H */ diff --git a/mm/shmem.c b/mm/shmem.c index 9187eee..64bf5ac 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2444,7 +2444,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int len; struct inode *inode; struct page *page; - char *kaddr; struct shmem_inode_info *info; len = strlen(symname) + 1; @@ -2483,9 +2482,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len); - kunmap_atomic(kaddr); + inode_nohighmem(inode); + memcpy(page_address(page), symname, len); SetPageUptodate(page); set_page_dirty(page); unlock_page(page); @@ -2506,13 +2504,12 @@ static const char *shmem_follow_link(struct dentry *dentry, void **cookie) 
return ERR_PTR(error); unlock_page(page); *cookie = page; - return kmap(page); + return page_address(page); } static void shmem_put_link(struct inode *unused, void *cookie) { struct page *page = cookie; - kunmap(page); mark_page_accessed(page); page_cache_release(page); } -- cgit v1.1 From 6b2553918d8b4e6de9853fd6315bec7271a2e592 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Nov 2015 10:20:54 -0500 Subject: replace ->follow_link() with new method that could stay in RCU mode new method: ->get_link(); replacement of ->follow_link(). The differences are: * inode and dentry are passed separately * might be called both in RCU and non-RCU mode; the former is indicated by passing it a NULL dentry. * when called that way it isn't allowed to block and should return ERR_PTR(-ECHILD) if it needs to be called in non-RCU mode. It's a flagday change - the old method is gone, all in-tree instances converted. Conversion isn't hard; that said, so far very few instances do not immediately bail out when called in RCU mode. That'll change in the next commits. Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 4 +-- Documentation/filesystems/porting | 6 ++++ drivers/staging/lustre/lustre/llite/symlink.c | 8 +++-- fs/9p/vfs_inode.c | 17 +++++++--- fs/9p/vfs_inode_dotl.c | 14 ++++++--- fs/affs/symlink.c | 4 +-- fs/autofs4/symlink.c | 13 +++++--- fs/btrfs/inode.c | 2 +- fs/ceph/inode.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsfs.h | 4 +-- fs/cifs/link.c | 6 ++-- fs/coda/cnode.c | 2 +- fs/configfs/symlink.c | 11 +++++-- fs/dcache.c | 2 +- fs/ecryptfs/inode.c | 12 +++++-- fs/ext2/symlink.c | 4 +-- fs/ext4/symlink.c | 13 +++++--- fs/f2fs/namei.c | 16 ++++++---- fs/fuse/dir.c | 9 ++++-- fs/gfs2/inode.c | 15 ++++++--- fs/hostfs/hostfs_kern.c | 10 ++++-- fs/jffs2/symlink.c | 2 +- fs/jfs/symlink.c | 4 +-- fs/kernfs/symlink.c | 11 +++++-- fs/libfs.c | 9 +++--- fs/minix/inode.c | 2 +- fs/namei.c | 45 +++++++++++++++++---------- fs/ncpfs/inode.c | 2 +- fs/nfs/symlink.c | 9 ++++-- fs/nilfs2/namei.c | 2 +- fs/ocfs2/symlink.c | 2 +- fs/overlayfs/inode.c | 12 ++++--- fs/proc/base.c | 22 +++++++------ fs/proc/inode.c | 7 +++-- fs/proc/namespaces.c | 9 ++++-- fs/proc/self.c | 9 ++++-- fs/proc/thread_self.c | 9 ++++-- fs/reiserfs/namei.c | 2 +- fs/squashfs/symlink.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/file.c | 2 +- fs/xfs/xfs_iops.c | 8 +++-- include/linux/fs.h | 6 ++-- mm/shmem.c | 12 ++++--- 45 files changed, 234 insertions(+), 132 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 06d4434..4fba54b 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -50,7 +50,7 @@ prototypes: int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - const char *(*follow_link) (struct dentry *, void **); + const char *(*get_link) (struct dentry *, struct inode *, void **); void (*put_link) (struct inode *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); @@ -83,7 +83,7 @@ rmdir: yes (both) (see below) rename: yes (all) (see below) rename2: yes (all) (see below) readlink: no -follow_link: no +get_link: no put_link: no setattr: yes permission: no (may not block if called in rcu-walk mode) diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 3eb7c35..cf92a8c 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -509,3 +509,9
@@ in your dentry operations instead. any symlink that might use page_follow_link_light/page_put_link() must have inode_nohighmem(inode) called before anything might start playing with its pagecache. +-- +[mandatory] + ->follow_link() is replaced with ->get_link(); same API, except that + * ->get_link() gets inode as a separate argument + * ->get_link() may be called in RCU mode - in that case NULL + dentry is passed diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index 69b2036..153fdf9 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -118,12 +118,14 @@ failed: return rc; } -static const char *ll_follow_link(struct dentry *dentry, void **cookie) +static const char *ll_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct inode *inode = d_inode(dentry); struct ptlrpc_request *request = NULL; int rc; char *symname = NULL; + if (!dentry) + return ERR_PTR(-ECHILD); CDEBUG(D_VFSTRACE, "VFS Op\n"); ll_inode_size_lock(inode); @@ -149,7 +151,7 @@ static void ll_put_link(struct inode *unused, void *cookie) struct inode_operations ll_fast_symlink_inode_operations = { .readlink = generic_readlink, .setattr = ll_setattr, - .follow_link = ll_follow_link, + .get_link = ll_get_link, .put_link = ll_put_link, .getattr = ll_getattr, .permission = ll_inode_permission, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 699941e..8ba5a89 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1223,18 +1223,25 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) } /** - * v9fs_vfs_follow_link - follow a symlink path + * v9fs_vfs_get_link - follow a symlink path * @dentry: dentry for symlink + * @inode: inode for symlink * @cookie: place to pass the data to put_link() */ -static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) +static const char *v9fs_vfs_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); - struct p9_fid *fid = v9fs_fid_lookup(dentry); + struct v9fs_session_info *v9ses; + struct p9_fid *fid; struct p9_wstat *st; char *res; + if (!dentry) + return ERR_PTR(-ECHILD); + + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); if (IS_ERR(fid)) @@ -1452,7 +1459,7 @@ static const struct inode_operations v9fs_file_inode_operations = { static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link, + .get_link = v9fs_vfs_get_link, .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index cb899af..0cc105d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -899,20 +899,26 @@ error: } /** - * v9fs_vfs_follow_link_dotl - follow a symlink path + * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink + * @inode: inode for symlink * @cookie: place to pass the data to put_link() */ static const char * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie) +v9fs_vfs_get_link_dotl(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct p9_fid *fid = v9fs_fid_lookup(dentry); + struct p9_fid *fid; char *target; int retval; + if (!dentry) + return ERR_PTR(-ECHILD); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); 
retval = p9_client_readlink(fid, &target); @@ -984,7 +990,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link_dotl, + .get_link = v9fs_vfs_get_link_dotl, .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index e3f9dc3..39d1194 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -20,7 +20,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) char c; char lc; - pr_debug("follow_link(ino=%lu)\n", inode->i_ino); + pr_debug("get_link(ino=%lu)\n", inode->i_ino); bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) @@ -71,7 +71,7 @@ const struct address_space_operations affs_symlink_aops = { const struct inode_operations affs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = affs_notify_change, }; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index da0c334..39e6f0b 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,10 +12,15 @@ #include "autofs_i.h" -static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) +static const char *autofs4_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi; + struct autofs_info *ino; + if (!dentry) + return ERR_PTR(-ECHILD); + sbi = autofs4_sbi(dentry->d_sb); + ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; @@ -23,5 +28,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) const struct inode_operations autofs4_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = autofs4_follow_link + .get_link = autofs4_get_link }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 70f98bf..3d4aa69 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10096,7 +10096,7 @@ static const struct inode_operations btrfs_special_inode_operations = { }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 498dcfa..da55eb8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1756,7 +1756,7 @@ retry: */ static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, .setxattr = ceph_setxattr, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cbc0f4b..4593f416 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -900,7 +900,7 @@ const struct inode_operations cifs_file_inode_ops = { const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, - .follow_link = cifs_follow_link, + .get_link = cifs_get_link, .put_link = kfree_put_link, .permission = cifs_permission, /* BB add the following two eventually */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c3cc160..6886328 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,9 
+120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern const char *cifs_follow_link(struct dentry *direntry, void **cookie); -extern int cifs_readlink(struct dentry *direntry, char __user *buffer, - int buflen); +extern const char *cifs_get_link(struct dentry *, struct inode *, void **); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname); extern int cifs_removexattr(struct dentry *, const char *); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index e3548f7..6f2439b5 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -627,9 +627,8 @@ cifs_hl_exit: } const char * -cifs_follow_link(struct dentry *direntry, void **cookie) +cifs_get_link(struct dentry *direntry, struct inode *inode, void **cookie) { - struct inode *inode = d_inode(direntry); int rc = -ENOMEM; unsigned int xid; char *full_path = NULL; @@ -639,6 +638,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie) struct cifs_tcon *tcon; struct TCP_Server_Info *server; + if (!direntry) + return ERR_PTR(-ECHILD); + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index dd6a79e..f18139c 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -18,7 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) static const struct inode_operations coda_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = coda_setattr, }; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index ec5c832..b91c01e 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,11 +279,16 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static const char *configfs_follow_link(struct dentry *dentry, void **cookie) +static const char *configfs_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long page; int error; + if (!dentry) + return ERR_PTR(-ECHILD); + + page = get_zeroed_page(GFP_KERNEL); if (!page) return ERR_PTR(-ENOMEM); @@ -297,7 +302,7 @@ static const char *configfs_follow_link(struct dentry *dentry, void **cookie) } const struct inode_operations configfs_symlink_inode_operations = { - .follow_link = configfs_follow_link, + .get_link = configfs_get_link, .readlink = generic_readlink, .put_link = free_page_put_link, .setattr = configfs_setattr, diff --git a/fs/dcache.c b/fs/dcache.c index 5c33aeb..d27f090 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1734,7 +1734,7 @@ static unsigned d_flags_for_inode(struct inode *inode) } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (unlikely(inode->i_op->follow_link)) { + if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2e47ba..5a05559 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -674,10 +674,16 @@ out: return rc ? 
ERR_PTR(rc) : buf; } -static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie) +static const char *ecryptfs_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { size_t len; - char *buf = ecryptfs_readlink_lower(dentry, &len); + char *buf; + + if (!dentry) + return ERR_PTR(-ECHILD); + + buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) return buf; fsstack_copy_attr_atime(d_inode(dentry), @@ -1095,7 +1101,7 @@ out: const struct inode_operations ecryptfs_symlink_iops = { .readlink = generic_readlink, - .follow_link = ecryptfs_follow_link, + .get_link = ecryptfs_get_link, .put_link = kfree_put_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index ae17179..4690511 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -22,7 +22,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR @@ -35,7 +35,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 0e6dc44..3b4bfe2 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,17 +23,20 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie) +static const char *ext4_encrypted_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; - struct inode *inode = d_inode(dentry); struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 plen, max_size = inode->i_sb->s_blocksize; + if (!dentry) + return ERR_PTR(-ECHILD); + res = ext4_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -87,7 +90,7 @@ errout: const struct inode_operations ext4_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_encrypted_follow_link, + .get_link = ext4_encrypted_get_link, .put_link = kfree_put_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, @@ -99,7 +102,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, @@ -110,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 484df68..2a8d84b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -315,9 +315,10 @@ fail: return err; } -static const char *f2fs_follow_link(struct dentry *dentry, void **cookie) +static const char *f2fs_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - const char 
*link = page_follow_link_light(dentry, cookie); + const char *link = page_get_link(dentry, inode, cookie); if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ page_put_link(NULL, *cookie); @@ -924,18 +925,21 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, } #ifdef CONFIG_F2FS_FS_ENCRYPTION -static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) +static const char *f2fs_encrypted_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; struct f2fs_str cstr; struct f2fs_str pstr = FSTR_INIT(NULL, 0); - struct inode *inode = d_inode(dentry); struct f2fs_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; + if (!dentry) + return ERR_PTR(-ECHILD); + res = f2fs_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -994,7 +998,7 @@ errout: const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, + .get_link = f2fs_encrypted_get_link, .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, @@ -1030,7 +1034,7 @@ const struct inode_operations f2fs_dir_inode_operations = { const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_follow_link, + .get_link = f2fs_get_link, .put_link = page_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e087..148e8ef 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1365,14 +1365,17 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static const char *fuse_follow_link(struct dentry *dentry, void **cookie) +static const char *fuse_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; ssize_t ret; + if (!dentry) + return ERR_PTR(-ECHILD); + link = (char *) __get_free_page(GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); @@ -1909,7 +1912,7 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, - .follow_link = fuse_follow_link, + .get_link = fuse_get_link, .put_link = free_page_put_link, .readlink = generic_readlink, .getattr = fuse_getattr, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 063fdfc..1095056 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1712,24 +1712,29 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry, } /** - * gfs2_follow_link - Follow a symbolic link + * gfs2_get_link - Follow a symbolic link * @dentry: The dentry of the link - * @nd: Data that we pass to vfs_follow_link() + * @inode: The inode of the link + * @cookie: place to store the information for ->put_link() * * This can handle symlinks of any size. 
* * Returns: 0 on success or error code */ -static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) +static const char *gfs2_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int size; char *buf; int error; + if (!dentry) + return ERR_PTR(-ECHILD); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); error = gfs2_glock_nq(&i_gh); if (error) { @@ -2132,7 +2137,7 @@ const struct inode_operations gfs2_dir_iops = { const struct inode_operations gfs2_symlink_iops = { .readlink = generic_readlink, - .follow_link = gfs2_follow_link, + .get_link = gfs2_get_link, .put_link = kfree_put_link, .permission = gfs2_permission, .setattr = gfs2_setattr, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2ac99db..6ce5309 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -892,9 +892,13 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) +static const char *hostfs_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - char *link = __getname(); + char *link; + if (!dentry) + return ERR_PTR(-ECHILD); + link = __getname(); if (link) { char *path = dentry_name(dentry); int err = -ENOMEM; @@ -922,7 +926,7 @@ static void hostfs_put_link(struct inode *unused, void *cookie) static const struct inode_operations hostfs_link_iops = { .readlink = generic_readlink, - .follow_link = hostfs_follow_link, + .get_link = hostfs_get_link, .put_link = hostfs_put_link, }; diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 8ce2f24..2cabd64 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -14,7 +14,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 5929e23..0211328 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -23,7 +23,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, @@ -33,7 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index db27252..ffae857 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -112,10 +112,15 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie) +static const char *kernfs_iop_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long page; + + if (!dentry) + return ERR_PTR(-ECHILD); + page = get_zeroed_page(GFP_KERNEL); if (!page) return ERR_PTR(-ENOMEM); error = kernfs_getlink(dentry, (char *)page); @@ -132,7 +137,7 @@ const struct inode_operations 
kernfs_symlink_iops = { .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, - .follow_link = kernfs_iop_follow_link, + .get_link = kernfs_iop_get_link, .put_link = free_page_put_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, diff --git a/fs/libfs.c b/fs/libfs.c index c7cbfb0..8dc37fc 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1092,14 +1092,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(simple_nosetlease); -const char *simple_follow_link(struct dentry *dentry, void **cookie) +const char *simple_get_link(struct dentry *dentry, struct inode *inode, + void **cookie) { - return d_inode(dentry)->i_link; + return inode->i_link; } -EXPORT_SYMBOL(simple_follow_link); +EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { - .follow_link = simple_follow_link, + .get_link = simple_get_link, .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 67a23bf..3cce709 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -435,7 +435,7 @@ static const struct address_space_operations minix_aops = { static const struct inode_operations minix_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .getattr = minix_getattr, }; diff --git a/fs/namei.c b/fs/namei.c index 2808958..1da3064 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -842,7 +842,7 @@ static inline void path_to_nameidata(const struct path *path, } /* - * Helper to directly jump to a known parsed path from ->follow_link, + * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ void nd_jump_link(struct path *path) @@ -1005,10 +1005,18 @@ const char *get_link(struct nameidata *nd) res = inode->i_link; if (!res) { if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, NULL, 0))) - return ERR_PTR(-ECHILD); + res = inode->i_op->get_link(NULL, inode, + &last->cookie); + if (res == ERR_PTR(-ECHILD)) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + res = inode->i_op->get_link(dentry, inode, + &last->cookie); + } + } else { + res = inode->i_op->get_link(dentry, inode, + &last->cookie); } - res = inode->i_op->follow_link(dentry, &last->cookie); if (IS_ERR_OR_NULL(res)) { last->cookie = NULL; return res; @@ -4495,8 +4503,8 @@ EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that - * have ->follow_link() touching nd only in nd_set_link(). Using (or not - * using) it for any given inode is up to filesystem. + * have ->get_link() not calling nd_jump_link(). Using (or not using) it + * for any given inode is up to filesystem. 
*/ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { @@ -4506,7 +4514,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) int res; if (!link) { - link = inode->i_op->follow_link(dentry, &cookie); + link = inode->i_op->get_link(dentry, inode, &cookie); if (IS_ERR(link)) return PTR_ERR(link); } @@ -4518,26 +4526,27 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ -static const char *page_getlink(struct dentry * dentry, void **cookie) +const char *page_get_link(struct dentry *dentry, struct inode *inode, + void **cookie) { char *kaddr; struct page *page; - struct address_space *mapping = dentry->d_inode->i_mapping; + struct address_space *mapping = inode->i_mapping; + + if (!dentry) + return ERR_PTR(-ECHILD); + page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; *cookie = page; BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); - nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } -const char *page_follow_link_light(struct dentry *dentry, void **cookie) -{ - return page_getlink(dentry, cookie); -} -EXPORT_SYMBOL(page_follow_link_light); +EXPORT_SYMBOL(page_get_link); void page_put_link(struct inode *unused, void *cookie) { @@ -4549,7 +4558,9 @@ EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { void *cookie = NULL; - int res = readlink_copy(buffer, buflen, page_getlink(dentry, &cookie)); + int res = readlink_copy(buffer, buflen, + page_get_link(dentry, d_inode(dentry), + &cookie)); if (cookie) page_put_link(NULL, cookie); return res; @@ -4600,7 +4611,7 @@ EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index bb856f7..3ab6cdb 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -244,7 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) static const struct inode_operations ncp_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = ncp_notify_change, }; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index abd93bf..8ade8a8 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -42,12 +42,15 @@ error: return -EIO; } -static const char *nfs_follow_link(struct dentry *dentry, void **cookie) +static const char *nfs_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct inode *inode = d_inode(dentry); struct page *page; void *err; + if (!dentry) + return ERR_PTR(-ECHILD); + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; @@ -64,7 +67,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie) */ const struct inode_operations nfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = nfs_follow_link, + .get_link = nfs_get_link, .put_link = page_put_link, .getattr = nfs_getattr, .setattr = nfs_setattr, diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 
90b3ba9..63dddb7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -569,7 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = { const struct inode_operations nilfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .permission = nilfs_permission, }; diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 66edce7..b4e79bc 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -88,7 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 4060ffd..38a0b8b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -137,17 +137,21 @@ struct ovl_link_data { void *cookie; }; -static const char *ovl_follow_link(struct dentry *dentry, void **cookie) +static const char *ovl_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { struct dentry *realdentry; struct inode *realinode; struct ovl_link_data *data = NULL; const char *ret; + if (!dentry) + return ERR_PTR(-ECHILD); + realdentry = ovl_dentry_real(dentry); realinode = realdentry->d_inode; - if (WARN_ON(!realinode->i_op->follow_link)) + if (WARN_ON(!realinode->i_op->get_link)) return ERR_PTR(-EPERM); if (realinode->i_op->put_link) { @@ -157,7 +161,7 @@ static const char *ovl_follow_link(struct dentry *dentry, void **cookie) data->realdentry = realdentry; } - ret = realinode->i_op->follow_link(realdentry, cookie); + ret = realinode->i_op->get_link(realdentry, realinode, cookie); if (IS_ERR_OR_NULL(ret)) { kfree(data); return ret; @@ -378,7 +382,7 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, - .follow_link = ovl_follow_link, + .get_link = ovl_get_link, .put_link = ovl_put_link, .readlink = ovl_readlink, .getattr = ovl_getattr, diff --git a/fs/proc/base.c b/fs/proc/base.c index bd3e9e6..1a489e2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1564,12 +1564,15 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_pid_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct inode *inode = d_inode(dentry); struct path path; int error = -EACCES; + if (!dentry) + return ERR_PTR(-ECHILD); + /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; @@ -1630,7 +1633,7 @@ out: const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, + .get_link = proc_pid_get_link, .setattr = proc_setattr, }; @@ -1895,7 +1898,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; @@ -1945,20 +1948,21 @@ struct map_files_info { * path to the file in question. 
*/ static const char * -proc_map_files_follow_link(struct dentry *dentry, void **cookie) +proc_map_files_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - return proc_pid_follow_link(dentry, NULL); + return proc_pid_get_link(dentry, inode, NULL); } /* - * Identical to proc_pid_link_inode_operations except for follow_link() + * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_map_files_follow_link, + .get_link = proc_map_files_get_link, .setattr = proc_setattr, }; @@ -1975,7 +1979,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, return -ENOENT; ei = PROC_I(inode); - ei->op.proc_get_link = proc_map_files_get_link; + ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index bd95b9f..10360b2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -393,9 +393,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static const char *proc_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct proc_dir_entry *pde = PDE(d_inode(dentry)); + struct proc_dir_entry *pde = PDE(inode); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); *cookie = pde; @@ -409,7 +410,7 @@ static void proc_put_link(struct inode *unused, void *p) const struct inode_operations proc_link_inode_operations = { .readlink = generic_readlink, - .follow_link = proc_follow_link, + .get_link = proc_get_link, .put_link = proc_put_link, }; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index f6e8354..63861c1 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,14 +30,17 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_ns_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; void *error = ERR_PTR(-EACCES); + if (!dentry) + return ERR_PTR(-ECHILD); + task = get_proc_task(inode); if (!task) return error; @@ -74,7 +77,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl static const struct inode_operations proc_ns_link_inode_operations = { .readlink = proc_ns_readlink, - .follow_link = proc_ns_follow_link, + .get_link = proc_ns_get_link, .setattr = proc_setattr, }; diff --git a/fs/proc/self.c b/fs/proc/self.c index 113b8d0..9dd0ae6 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -18,12 +18,15 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_self_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; + struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); char *name; + if (!dentry) + return ERR_PTR(-ECHILD); if (!tgid) return ERR_PTR(-ENOENT); /* 11 for max length of signed int in decimal + NULL term */ @@ 
-36,7 +39,7 @@ static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, + .get_link = proc_self_get_link, .put_link = kfree_put_link, }; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 947b0f4..50eef6f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -19,13 +19,16 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_thread_self_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; + struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; + if (!dentry) + return ERR_PTR(-ECHILD); if (!pid) return ERR_PTR(-ENOENT); name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); @@ -37,7 +40,7 @@ static const char *proc_thread_self_follow_link(struct dentry *dentry, void **co static const struct inode_operations proc_thread_self_inode_operations = { .readlink = proc_thread_self_readlink, - .follow_link = proc_thread_self_follow_link, + .get_link = proc_thread_self_get_link, .put_link = kfree_put_link, }; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 4fc2326..ecbf11e 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1665,7 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { */ const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 12806df..7c635a5 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -119,7 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = { const struct inode_operations squashfs_symlink_inode_ops = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .getxattr = generic_getxattr, .listxattr = squashfs_listxattr diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index ef8bcdb..80a40bc 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -146,7 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi, static const struct inode_operations sysv_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .get_link = page_get_link, .put_link = page_put_link, .getattr = sysv_getattr, }; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0edc128..eff6280 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1608,7 +1608,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .setxattr = ubifs_setxattr, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 245268a..f638fd5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -414,13 +414,17 @@ xfs_vn_rename( * uio is kmalloced for this reason... 
*/ STATIC const char * -xfs_vn_follow_link( +xfs_vn_get_link( struct dentry *dentry, + struct inode *inode, void **cookie) { char *link; int error = -ENOMEM; + if (!dentry) + return ERR_PTR(-ECHILD); + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) goto out_err; @@ -1172,7 +1176,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = xfs_vn_follow_link, + .get_link = xfs_vn_get_link, .put_link = kfree_put_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, diff --git a/include/linux/fs.h b/include/linux/fs.h index dfeda44..d2fdf09 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1633,7 +1633,7 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - const char * (*follow_link) (struct dentry *, void **); + const char * (*get_link) (struct dentry *, struct inode *, void **); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); @@ -2736,7 +2736,7 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); -extern const char *page_follow_link_light(struct dentry *, void **); +extern const char *page_get_link(struct dentry *, struct inode *, void **); extern void page_put_link(struct inode *, void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); @@ -2754,7 +2754,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -const char *simple_follow_link(struct dentry *, void **); +const char *simple_get_link(struct dentry *, struct inode *, void **); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); diff --git a/mm/shmem.c b/mm/shmem.c index 64bf5ac..684dbc3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2496,10 +2496,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static const char *shmem_follow_link(struct dentry *dentry, void **cookie) +static const char *shmem_get_link(struct dentry *dentry, + struct inode *inode, void **cookie) { struct page *page = NULL; - int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); + int error; + if (!dentry) + return ERR_PTR(-ECHILD); + error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); if (error) return ERR_PTR(error); unlock_page(page); @@ -2656,7 +2660,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, @@ -2667,7 +2671,7 @@ static const struct inode_operations shmem_short_symlink_operations = { static const struct inode_operations shmem_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_link, + .get_link = shmem_get_link, .put_link = shmem_put_link, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, -- cgit v1.1 From d3883d4f93449343be6296e2274360db39b6842a Mon Sep 17 00:00:00 2001 From: Al Viro 
Date: Tue, 17 Nov 2015 10:41:04 -0500 Subject: teach page_get_link() to work in RCU mode more or less along the lines of Neil's patchset, sans the insanity around kmap(). Signed-off-by: Al Viro --- fs/namei.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1da3064..8f51788 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4533,12 +4533,19 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct page *page; struct address_space *mapping = inode->i_mapping; - if (!dentry) - return ERR_PTR(-ECHILD); - - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - return (char*)page; + if (!dentry) { + page = find_get_page(mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return (char*)page; + } *cookie = page; BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); -- cgit v1.1 From 6a6c99049635473b64c384135a6906a10df2c916 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Nov 2015 10:54:32 -0500 Subject: teach shmem_get_link() to work in RCU mode Signed-off-by: Al Viro --- mm/shmem.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 684dbc3..0605716 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2501,12 +2501,20 @@ static const char *shmem_get_link(struct dentry *dentry, { struct page *page = NULL; int error; - if (!dentry) - return ERR_PTR(-ECHILD); - error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); - if (error) - return ERR_PTR(error); - unlock_page(page); + if (!dentry) { + page = find_get_page(inode->i_mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + if (error) + return ERR_PTR(error); + unlock_page(page); + } *cookie = page; return page_address(page); } -- cgit v1.1 From 1a384eaac265b57961c9696d9177f82eb84319e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Nov 2015 10:58:42 -0500 Subject: teach proc_self_get_link()/proc_thread_self_get_link() to work in RCU mode Signed-off-by: Al Viro --- fs/proc/self.c | 8 +++----- fs/proc/thread_self.c | 9 ++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/proc/self.c b/fs/proc/self.c index 9dd0ae6..7a8b19e 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -25,14 +25,12 @@ static const char *proc_self_get_link(struct dentry *dentry, pid_t tgid = task_tgid_nr_ns(current, ns); char *name; - if (!dentry) - return ERR_PTR(-ECHILD); if (!tgid) return ERR_PTR(-ENOENT); /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - return ERR_PTR(-ENOMEM); + name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? 
ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d", tgid); return *cookie = name; } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 50eef6f..03eaa84 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -27,13 +27,12 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, pid_t pid = task_pid_nr_ns(current, ns); char *name; - if (!dentry) - return ERR_PTR(-ECHILD); if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); - if (!name) - return ERR_PTR(-ENOMEM); + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, + dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d/task/%d", tgid, pid); return *cookie = name; } -- cgit v1.1 From 0d0def49d05ae988936268b0e57d19aeef8c3ad2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Nov 2015 21:14:24 -0500 Subject: teach nfs_get_link() to work in RCU mode based upon the corresponding patch from Neil's March patchset, again with kmap-related horrors removed. Signed-off-by: Al Viro --- fs/nfs/inode.c | 21 +++++++++++++++++++++ fs/nfs/symlink.c | 30 ++++++++++++++++++++---------- include/linux/nfs_fs.h | 1 + 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ae9aa0b..aa828e8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1087,6 +1087,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) || NFS_STALE(inode); } +int nfs_revalidate_mapping_rcu(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = -ECHILD; + goto out; + } + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock) || + (nfsi->cache_validity & NFS_INO_INVALID_DATA)) + ret = -ECHILD; + spin_unlock(&inode->i_lock); +out: + return ret; +} + /** * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 8ade8a8..95c69af 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -48,16 +48,26 @@ static const char *nfs_get_link(struct dentry *dentry, struct page *page; void *err; - if (!dentry) - return ERR_PTR(-ECHILD); - - err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); - if (err) - return err; - page = read_cache_page(&inode->i_data, 0, - (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) - return ERR_CAST(page); + if (!dentry) { + err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); + if (err) + return err; + page = find_get_page(inode->i_mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + if (err) + return err; + page = read_cache_page(&inode->i_data, 0, + (filler_t *)nfs_symlink_filler, inode); + if (IS_ERR(page)) + return ERR_CAST(page); + } *cookie = page; return page_address(page); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c0e9614..37a3d29 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -359,6 +359,7 @@ extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_revalidate_mapping(struct 
inode *inode, struct address_space *mapping); +extern int nfs_revalidate_mapping_rcu(struct inode *inode); extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); -- cgit v1.1 From 8d7d377c2bea16afa6b600a4517615a9eebb259b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2015 11:32:13 +0100 Subject: perf tui: Change default selection background color to yellow Boris reported that 'perf top' is unusable on his default 'black on white' terminal, which uses (eye friendly) light-grey as a background color. The reason is that the TUI cursor for the current selection line uses HE_COLORSET_SELECTED, and that has a default background color of 'lightgrey' - which is a common terminal background choice and thus the colors conflict. Use yellow as the background color instead: that should be an uncommon terminal background, yet it's still ergonomic on both black and white/grey terminals. [ It would be a better solution to straight out detect color collisions and resolve them reasonably by converting them to RGB and calculating color space distances, but I was unable to find proper documentation for SLtt_get_color_object() to recover the current color scheme so I gave up ... Yellow works well enough. ] Reported-and-Tested-by: Borislav Petkov Signed-off-by: Ingo Molnar Tested-by: Arnaldo Carvalho de Melo Cc: David Binderman Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150305103213.GA23046@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index e9703c0..d372021 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -528,7 +528,7 @@ static struct ui_browser_colorset { .colorset = HE_COLORSET_SELECTED, .name = "selected", .fg = "black", - .bg = "lightgray", + .bg = "yellow", }, { .colorset = HE_COLORSET_CODE, -- cgit v1.1 From bae32b50ea96ca0f8702ea55e62095e8cc4745e2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:20 +0900 Subject: perf tools: Fix map_groups__clone to put cloned map Fix map_groups__clone() to put the cloned map after inserting it into the map_groups.
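To illustrate the reference-counting rule at work here, a standalone sketch (illustrative obj/slot names, not the perf code): the constructor hands the creator one reference, the container takes its own reference on insert, so the creator must put its reference once insertion is done.

/* Standalone sketch of the map__get()/map__put() discipline. */
#include <stdlib.h>

struct obj { int refcnt; };

static struct obj *obj__new(void)
{
        struct obj *o = calloc(1, sizeof(*o));

        if (o)
                o->refcnt = 1;          /* creator's reference */
        return o;
}

static void obj__get(struct obj *o) { o->refcnt++; }

static void obj__put(struct obj *o)
{
        if (o && --o->refcnt == 0)
                free(o);
}

static struct obj *slot;                /* stands in for the map_groups */

static void slot__insert(struct obj *o)
{
        obj__get(o);                    /* container takes its own reference */
        slot = o;
}

int main(void)
{
        struct obj *o = obj__new();     /* refcnt == 1 */

        if (!o)
                return 1;
        slot__insert(o);                /* refcnt == 2 */
        obj__put(o);                    /* the fix: drop the creator's reference */
        obj__put(slot);                 /* container teardown: refcnt == 0, freed */
        return 0;
}

Without the creator's put, the object leaves teardown with refcnt == 1 and is never freed, which is exactly the leak the trace below shows.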
Refcnt debugger shows: ---- ==== [0] ==== Unreclaimed map: 0x2a27ee0 Refcount +1 => 1 at ./perf(map_groups__clone+0x8d) [0x4bb7ed] ./perf(thread__fork+0xbe) [0x4c1f9e] ./perf(machine__process_fork_event+0x216) [0x4b79a6] ./perf(perf_event__synthesize_threads+0x38b) [0x48135b] ./perf(cmd_top+0xdc6) [0x43cb76] ./perf() [0x477223] ./perf(main+0x617) [0x422077] /lib64/libc.so.6(__libc_start_main+0xf0) [0x7ff806af8fe0] ./perf() [0x4221ed] Refcount +1 => 2 at ./perf(map_groups__clone+0x128) [0x4bb888] ./perf(thread__fork+0xbe) [0x4c1f9e] ./perf(machine__process_fork_event+0x216) [0x4b79a6] ./perf(perf_event__synthesize_threads+0x38b) [0x48135b] ./perf(cmd_top+0xdc6) [0x43cb76] ./perf() [0x477223] ./perf(main+0x617) [0x422077] /lib64/libc.so.6(__libc_start_main+0xf0) [0x7ff806af8fe0] ./perf() [0x4221ed] Refcount -1 => 1 at ./perf(map_groups__exit+0x87) [0x4ba757] ./perf(map_groups__put+0x68) [0x4ba9a8] ./perf(thread__put+0x8b) [0x4c1aeb] ./perf(machine__delete_threads+0x81) [0x4b48f1] ./perf(perf_session__delete+0x4f) [0x4be63f] ./perf(cmd_top+0x1094) [0x43ce44] ./perf() [0x477223] ./perf(main+0x617) [0x422077] /lib64/libc.so.6(__libc_start_main+0xf0) [0x7ff806af8fe0] ./perf() [0x4221ed] ---- This shows that map_groups__clone() gets the map twice but puts it only once, in map_groups__exit(). Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021120.10245.95388.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 93d9f1c..7b1c720 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -742,6 +742,7 @@ int map_groups__clone(struct map_groups *mg, if (new == NULL) goto out_unlock; map_groups__insert(mg, new); + map__put(new); } err = 0; -- cgit v1.1 From 544c2ae7b1a794ad0bc5ec24d832ab5658d5aef6 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:27 +0900 Subject: perf stat: Fix cmd_stat to release cpu_map Fix cmd_stat() to release the cpu_map objects (aggr_map and cpus_aggr_map) when it is done with them. The refcnt debugger shows that cmd_stat() initializes the cpu_map but does not put it. ---- # ./perf stat -v ls .... REFCNT: BUG: Unreclaimed objects found. ==== [0] ==== Unreclaimed cpu_map@0x29339c0 Refcount +1 => 1 at ./perf(cpu_map__empty_new+0x6d) [0x4e64bd] ./perf(cmd_stat+0x5fe) [0x43594e] ./perf() [0x47b785] ./perf(main+0x617) [0x422587] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f2dff420af5] ./perf() [0x4226fd] REFCNT: Total 1 objects are not reclaimed. "cpu_map" leaks 1 objects ---- Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021127.10245.93697.stgit@localhost.localdomain [ Remove NULL checks before calling the put operation, it checks it already ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e74712d..25a95f4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1094,6 +1094,14 @@ static int perf_stat_init_aggr_mode(void) return cpus_aggr_map ?
0 : -ENOMEM; } +static void perf_stat__exit_aggr_mode(void) +{ + cpu_map__put(aggr_map); + cpu_map__put(cpus_aggr_map); + aggr_map = NULL; + cpus_aggr_map = NULL; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -1442,6 +1450,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) if (!forever && status != -1 && !interval) print_counters(NULL, argc, argv); + perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: perf_evlist__delete(evsel_list); -- cgit v1.1 From 17577decb2ddae28f5a449ddb79cf0ed3e2312c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:29 +0900 Subject: perf hists: Fix hists_evsel to release hists Since hists__init() doesn't set a destructor for hists_evsel (which is an extended evsel structure), the extended part of a hists_evsel is not deleted when the hists_evsel is released (note that the hists_evsel object itself is freed). Fix this by adding a destructor for hists_evsel and setting it up. Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021129.10245.28710.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 6e8e0ee..565ea35 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1567,6 +1567,13 @@ static int hists_evsel__init(struct perf_evsel *evsel) return 0; } +static void hists_evsel__exit(struct perf_evsel *evsel) +{ + struct hists *hists = evsel__hists(evsel); + + hists__delete_entries(hists); +} + /* * XXX We probably need a hists_evsel__exit() to free the hist_entries * stored in the rbtree... @@ -1575,7 +1582,8 @@ static int hists_evsel__init(struct perf_evsel *evsel) int hists__init(void) { int err = perf_evsel__object_config(sizeof(struct hists_evsel), - hists_evsel__init, NULL); + hists_evsel__init, + hists_evsel__exit); if (err) fputs("FATAL ERROR: Couldn't setup hists class\n", stderr); -- cgit v1.1 From d91130e90a005876b488b6d52b743149d95b4a59 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:31 +0900 Subject: perf tools: Fix maps__fixup_overlappings to put used maps Since __map_groups__insert() takes its own reference on the given map, we don't need to keep ours, so put the maps after insertion. Refcnt debugger shows that map_groups__fixup_overlappings() got a map twice but the group released it just once. This pattern usually indicates that the leak happens at the caller site.
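As an annotated sketch of the fixed split path (simplified from the map.c hunk quoted after the trace): each half produced when an overlapping map is split is cloned, inserted, and then put, because __map_groups__insert() takes its own reference.

        /* Simplified: 'before' is the clone created to keep the low half
         * of an overlapped map; the groups take their own reference on
         * insert, so the reference returned by map__clone() is dropped.
         * The 'after' half is handled the same way. */
        struct map *before = map__clone(pos);
        if (before != NULL) {
                __map_groups__insert(pos->groups, before);
                map__put(before);
        }

The refcnt debugger trace: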
---- ==== [0] ==== Unreclaimed map@0x39d3ae0 Refcount +1 => 1 at ./perf(map_groups__fixup_overlappings+0x335) [0x4c1865] ./perf(thread__insert_map+0x30) [0x4c8e00] ./perf(machine__process_mmap2_event+0x106) [0x4bd876] ./perf() [0x4c378e] ./perf() [0x4c4393] ./perf(perf_session__process_events+0x38a) [0x4c654a] ./perf(cmd_record+0xe24) [0x42fc94] ./perf() [0x47b745] ./perf(main+0x617) [0x422547] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f2eca2deaf5] ./perf() [0x4226bd] Refcount +1 => 2 at ./perf(map_groups__fixup_overlappings+0x3c5) [0x4c18f5] ./perf(thread__insert_map+0x30) [0x4c8e00] ./perf(machine__process_mmap2_event+0x106) [0x4bd876] ./perf() [0x4c378e] ./perf() [0x4c4393] ./perf(perf_session__process_events+0x38a) [0x4c654a] ./perf(cmd_record+0xe24) [0x42fc94] ./perf() [0x47b745] ./perf(main+0x617) [0x422547] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f2eca2deaf5] ./perf() [0x4226bd] Refcount -1 => 1 at ./perf(map_groups__exit+0x92) [0x4c0962] ./perf(map_groups__put+0x60) [0x4c0bc0] ./perf(thread__put+0x90) [0x4c8a40] ./perf(machine__delete_threads+0x7e) [0x4bad9e] ./perf(perf_session__delete+0x4f) [0x4c499f] ./perf(cmd_record+0xb6d) [0x42f9dd] ./perf() [0x47b745] ./perf(main+0x617) [0x422547] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f2eca2deaf5] ./perf() [0x4226bd] ---- Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021131.10245.41485.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 7b1c720..171b6d1 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -691,6 +691,7 @@ static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp __map_groups__insert(pos->groups, before); if (verbose >= 2) map__fprintf(before, fp); + map__put(before); } if (map->end < pos->end) { @@ -705,6 +706,7 @@ static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp __map_groups__insert(pos->groups, after); if (verbose >= 2) map__fprintf(after, fp); + map__put(after); } put_map: map__put(pos); -- cgit v1.1 From cc1121ab9687d660cc02f50b1a4974112f87a8e6 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:33 +0900 Subject: perf machine: Fix machine.vmlinux_maps to make sure to clear the old one Fix machine.vmlinux_maps to make sure the old maps are cleared when they are renewed. Otherwise the previous maps in vmlinux_maps are leaked, because they are simply overwritten.
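The change amounts to two pieces, excerpted from the patch below with annotations:

        /* machine__init(): start from a zeroed vmlinux_maps, so stale
         * pointers can never be mistaken for live maps */
        memset(machine->vmlinux_maps, 0, sizeof(machine->vmlinux_maps));

        /* __machine__create_kernel_maps(): in case of renewal, destroy
         * the previously installed kernel maps instead of overwriting
         * (and thereby leaking) them */
        machine__destroy_kernel_maps(machine);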
Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021133.10245.93730.stgit@localhost.localdomain [ Simplified the memset, same end result ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index bfc289c..f5882b8 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -44,6 +44,8 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid) machine->comm_exec = false; machine->kernel_start = 0; + memset(machine->vmlinux_maps, 0, sizeof(machine->vmlinux_maps)); + machine->root_dir = strdup(root_dir); if (machine->root_dir == NULL) return -ENOMEM; @@ -770,6 +772,9 @@ int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) enum map_type type; u64 start = machine__get_running_kernel_start(machine, NULL); + /* In case of renewal the kernel map, destroy previous one */ + machine__destroy_kernel_maps(machine); + for (type = 0; type < MAP__NR_TYPES; ++type) { struct kmap *kmap; struct map *map; -- cgit v1.1 From 5191d887681dd34ba3993a438d5746378952885a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:35 +0900 Subject: perf tools: Fix write_numa_topology to put cpu_map instead of free Fix write_numa_topology to put cpu_map instead of free because cpu_map is managed based on refcnt. Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021135.10245.79046.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4383800..5ac7bdb 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -724,7 +724,7 @@ static int write_numa_topology(int fd, struct perf_header *h __maybe_unused, done: free(buf); fclose(fp); - free(node_map); + cpu_map__put(node_map); return ret; } -- cgit v1.1 From de7cf7cadca3a3c32c1f1dbf4593a54f236e2dcf Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 7 Dec 2015 22:21:43 -0600 Subject: perf tools: Remove unused pager_use_color variable Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/e540c61b3068761181db6d9b1b3411990bafdb2f.1449548395.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 - tools/perf/util/cache.h | 1 - tools/perf/util/color.c | 2 +- tools/perf/util/environment.c | 8 -------- 4 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 tools/perf/util/environment.c diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 0513dd5..62392ab 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -6,7 +6,6 @@ libperf-y += config.o libperf-y += ctype.o libperf-y += db-export.o libperf-y += env.o -libperf-y += environment.o libperf-y += event.o libperf-y += evlist.o libperf-y += evsel.o diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index c861373..4c2b764 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -31,7 +31,6 @@ extern const char *perf_config_dirname(const char *, const char *); /* pager.c */ extern void setup_pager(void); extern int pager_in_use(void); -extern int pager_use_color; char *alias_lookup(const char *alias); int split_cmdline(char 
*cmdline, const char ***argv); diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c index 9b95654..e5fb88b 100644 --- a/tools/perf/util/color.c +++ b/tools/perf/util/color.c @@ -24,7 +24,7 @@ int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty) auto_color: if (stdout_is_tty < 0) stdout_is_tty = isatty(1); - if (stdout_is_tty || (pager_in_use() && pager_use_color)) { + if (stdout_is_tty || pager_in_use()) { char *term = getenv("TERM"); if (term && strcmp(term, "dumb")) return 1; diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c deleted file mode 100644 index 7405123..0000000 --- a/tools/perf/util/environment.c +++ /dev/null @@ -1,8 +0,0 @@ -/* - * We put all the perf config variables in this same object - * file, so that programs can link against the config parser - * without having to link against all the rest of perf. - */ -#include "cache.h" - -int pager_use_color = 1; -- cgit v1.1 From 1fe143c5f928e3d117355ce2655bac0eb80c1aa3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 7 Dec 2015 22:21:42 -0600 Subject: perf tools: Move term functions out of util.c The term functions are needed by help.c which is going to be moved into a separate library. Move them out of util.c and into their own file. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/9a39c854dd156b55ebda57e427594c9a59dcb40f.1449548395.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/term.c | 35 +++++++++++++++++++++++++++++++++++ tools/perf/util/term.h | 10 ++++++++++ tools/perf/util/util.c | 34 ---------------------------------- tools/perf/util/util.h | 4 +--- 5 files changed, 47 insertions(+), 37 deletions(-) create mode 100644 tools/perf/util/term.c create mode 100644 tools/perf/util/term.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 62392ab..65fef59 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -86,6 +86,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-pt.o libperf-$(CONFIG_AUXTRACE) += intel-bts.o libperf-y += parse-branch-options.o libperf-y += parse-regs-options.o +libperf-y += term.o libperf-$(CONFIG_LIBBPF) += bpf-loader.o libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o diff --git a/tools/perf/util/term.c b/tools/perf/util/term.c new file mode 100644 index 0000000..90b47d8 --- /dev/null +++ b/tools/perf/util/term.c @@ -0,0 +1,35 @@ +#include "util.h" + +void get_term_dimensions(struct winsize *ws) +{ + char *s = getenv("LINES"); + + if (s != NULL) { + ws->ws_row = atoi(s); + s = getenv("COLUMNS"); + if (s != NULL) { + ws->ws_col = atoi(s); + if (ws->ws_row && ws->ws_col) + return; + } + } +#ifdef TIOCGWINSZ + if (ioctl(1, TIOCGWINSZ, ws) == 0 && + ws->ws_row && ws->ws_col) + return; +#endif + ws->ws_row = 25; + ws->ws_col = 80; +} + +void set_term_quiet_input(struct termios *old) +{ + struct termios tc; + + tcgetattr(0, old); + tc = *old; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; + tcsetattr(0, TCSANOW, &tc); +} diff --git a/tools/perf/util/term.h b/tools/perf/util/term.h new file mode 100644 index 0000000..2c06a61 --- /dev/null +++ b/tools/perf/util/term.h @@ -0,0 +1,10 @@ +#ifndef __PERF_TERM_H +#define __PERF_TERM_H + +struct termios; +struct winsize; + +void get_term_dimensions(struct winsize *ws); +void set_term_quiet_input(struct termios *old); + +#endif /* __PERF_TERM_H */ diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 75759ae..07da970 
100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -355,40 +355,6 @@ void sighandler_dump_stack(int sig) exit(sig); } -void get_term_dimensions(struct winsize *ws) -{ - char *s = getenv("LINES"); - - if (s != NULL) { - ws->ws_row = atoi(s); - s = getenv("COLUMNS"); - if (s != NULL) { - ws->ws_col = atoi(s); - if (ws->ws_row && ws->ws_col) - return; - } - } -#ifdef TIOCGWINSZ - if (ioctl(1, TIOCGWINSZ, ws) == 0 && - ws->ws_row && ws->ws_col) - return; -#endif - ws->ws_row = 25; - ws->ws_col = 80; -} - -void set_term_quiet_input(struct termios *old) -{ - struct termios tc; - - tcgetattr(0, old); - tc = *old; - tc.c_lflag &= ~(ICANON | ECHO); - tc.c_cc[VMIN] = 0; - tc.c_cc[VTIME] = 0; - tcsetattr(0, TCSANOW, &tc); -} - int parse_nsec_time(const char *str, u64 *ptime) { u64 time_sec, time_nsec; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index dcc6590..150858f 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -282,9 +283,6 @@ void sighandler_dump_stack(int sig); extern unsigned int page_size; extern int cacheline_size; -void get_term_dimensions(struct winsize *ws); -void set_term_quiet_input(struct termios *old); - struct parse_tag { char tag; int mult; -- cgit v1.1 From 2bdb2c2729d2ba2f2f90b729d04254308096c5a0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 7 Dec 2015 22:21:46 -0600 Subject: perf tools: Save cmdline arguments earlier perf_env__set_cmdline() only saves the arguments the first time it's called. It doesn't need to be called every time the options and suboptions are parsed. Instead it can just be called once. This also has the advantage of making the option parsing code less perf-specific so it can be moved out to a library. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/19b76a5aa1b688bd635bd65d80bbc103a978d75e.1449548395.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf.c | 1 + tools/perf/util/env.c | 9 --------- tools/perf/util/parse-options.c | 2 -- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 4bee53c..59ea48c 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -384,6 +384,7 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) use_pager = 1; commit_pager_choice(); + perf_env__set_cmdline(&perf_env, argc, argv); status = p->fn(argc, argv, prefix); exit_browser(status); perf_env__exit(&perf_env); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 6af4f7c..7dd5939 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -25,15 +25,6 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]) { int i; - /* - * If env->cmdline_argv has already been set, do not override it. This allows - * a command to set the cmdline, parse args and then call another - * builtin function that implements a command -- e.g, cmd_kvm calling - * cmd_record. 
- */ - if (env->cmdline_argv != NULL) - return 0; - /* do not include NULL termination */ env->cmdline_argv = calloc(argc, sizeof(char *)); if (env->cmdline_argv == NULL) diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index 9fca092..d09aff9 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -501,8 +501,6 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o { struct parse_opt_ctx_t ctx; - perf_env__set_cmdline(&perf_env, argc, argv); - /* build usage string if it's not provided */ if (subcommands && !usagestr[0]) { struct strbuf buf = STRBUF_INIT; -- cgit v1.1 From 0a4bb5da957b83ece8b4723c5bac7a5d29fbfb33 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 7 Dec 2015 22:21:48 -0600 Subject: perf tools: Move cmd_version() to builtin-version.c Move cmd_version() to its own file so that help.c can be moved to a library. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/e908b1b68f20ab6d8d33941d5571c23110622e60.1449548395.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 1 + tools/perf/builtin-version.c | 10 ++++++++++ tools/perf/util/help.c | 7 ------- 3 files changed, 11 insertions(+), 7 deletions(-) create mode 100644 tools/perf/builtin-version.c diff --git a/tools/perf/Build b/tools/perf/Build index 2c7aaf2..2a41217 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -20,6 +20,7 @@ perf-y += builtin-kvm.o perf-y += builtin-inject.o perf-y += builtin-mem.o perf-y += builtin-data.o +perf-y += builtin-version.o perf-$(CONFIG_AUDIT) += builtin-trace.o perf-$(CONFIG_LIBELF) += builtin-probe.o diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c new file mode 100644 index 0000000..9b10cda --- /dev/null +++ b/tools/perf/builtin-version.c @@ -0,0 +1,10 @@ +#include "util/util.h" +#include "builtin.h" +#include "perf.h" + +int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused, + const char *prefix __maybe_unused) +{ + printf("perf version %s\n", perf_version_string); + return 0; +} diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c index fa1fc4a..929c93f 100644 --- a/tools/perf/util/help.c +++ b/tools/perf/util/help.c @@ -332,10 +332,3 @@ const char *help_unknown_cmd(const char *cmd) exit(1); } - -int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused, - const char *prefix __maybe_unused) -{ - printf("perf version %s\n", perf_version_string); - return 0; -} -- cgit v1.1 From 50e19ef978158a3d1f790568eccd8e4a802190c2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 12:00:53 +0900 Subject: perf annotate: Check argument before calling setup_browser() This is necessary to get rid of the browser dependency from usage_with_options() and its friends. Because there's no code changing the argc and argv, it'd be ok to check it early. 
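For illustration, here is a standalone C sketch, not perf code, of the "single leftover argument is a symbol filter" convention the patch moves earlier: after option parsing, zero leftover arguments means no filter, exactly one is taken as the filter, and more than one triggers the usage message. The option-consumption step is simplified away; only the leftover-argument handling mirrors the patch.

	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		const char *sym_filter = NULL;

		/* Pretend options were already consumed, as parse_options() does;
		 * what remains in argv[0..argc-1] are the non-option arguments. */
		argc--;
		argv++;

		if (argc) {
			if (argc > 1) {
				/* more than one leftover argument is a user error */
				fprintf(stderr, "usage: demo [symbol-filter]\n");
				exit(129); /* same exit code perf's usage_with_options() uses */
			}
			sym_filter = argv[0]; /* single leftover: treat it as the filter */
		}

		printf("symbol filter: %s\n", sym_filter ? sym_filter : "(none)");
		return 0;
	}

Because nothing after parse_options() modifies argc/argv, this check can run immediately after parsing, which is exactly what lets the browser setup move later.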
Signed-off-by: Namhyung Kim Reviewed-by: Josh Poimboeuf Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449716459-23004-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 2bf9b3f..55f6f8d 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -343,6 +343,16 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) return ret; argc = parse_options(argc, argv, options, annotate_usage, 0); + if (argc) { + /* + * Special case: if there's an argument left then assume that + * it's a symbol filter: + */ + if (argc > 1) + usage_with_options(annotate_usage, options); + + annotate.sym_hist_filter = argv[0]; + } if (annotate.use_stdio) use_browser = 0; @@ -369,17 +379,6 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) if (setup_sorting() < 0) usage_with_options(annotate_usage, options); - if (argc) { - /* - * Special case: if there's an argument left then assume that - * it's a symbol filter: - */ - if (argc > 1) - usage_with_options(annotate_usage, options); - - annotate.sym_hist_filter = argv[0]; - } - ret = __cmd_annotate(&annotate); out_delete: -- cgit v1.1 From 3df668e74a5bc60d74c2ce0b3498af2d77b4b556 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 12:00:54 +0900 Subject: perf annotate: Delay UI browser setup after initialization is done Move setup_browser after all necessary initialization is done. This is to remove the browser dependency from usage_with_options and friends. Signed-off-by: Namhyung Kim Reviewed-by: Josh Poimboeuf Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449716459-23004-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 55f6f8d..1f00dc7 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -354,17 +354,8 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) annotate.sym_hist_filter = argv[0]; } - if (annotate.use_stdio) - use_browser = 0; - else if (annotate.use_tui) - use_browser = 1; - else if (annotate.use_gtk) - use_browser = 2; - file.path = input_name; - setup_browser(true); - annotate.session = perf_session__new(&file, false, &annotate.tool); if (annotate.session == NULL) return -1; @@ -379,6 +370,15 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) if (setup_sorting() < 0) usage_with_options(annotate_usage, options); + if (annotate.use_stdio) + use_browser = 0; + else if (annotate.use_tui) + use_browser = 1; + else if (annotate.use_gtk) + use_browser = 2; + + setup_browser(true); + ret = __cmd_annotate(&annotate); out_delete: -- cgit v1.1 From 1b0344e64d7b4e512a8e5d2bc88b022fbb7a9ee6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 12:00:55 +0900 Subject: perf kvm: Remove invocation of setup/exit_browser() Calling setup_browser(false) with use_browser = 0 is meaningless. Just get rid of it. This is necessary to remove the browser dependency from usage_with_options() and friends. 
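The browser-setup patches above all converge on one ordering rule: everything that can fail with a usage message runs first, and the UI comes up only once the arguments are known-good, so error paths never need browser teardown. A minimal standalone sketch of that shape, with invented names for illustration:

	#include <stdio.h>
	#include <stdlib.h>

	static int validate_args(int argc)
	{
		return argc > 1 ? -1 : 0; /* this demo accepts no arguments */
	}

	static void setup_ui(void)
	{
		puts("browser/UI initialized");
	}

	int main(int argc, char **argv)
	{
		(void)argv;

		/* validation happens before any UI state exists... */
		if (validate_args(argc)) {
			fprintf(stderr, "usage: demo\n");
			exit(129); /* no exit_browser() needed: nothing was set up */
		}

		/* ...and the UI only comes up with known-good arguments */
		setup_ui();
		return 0;
	}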
Signed-off-by: Namhyung Kim Reviewed-by: Josh Poimboeuf Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449716459-23004-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kvm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index dd94b4c..031f9f5 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1351,7 +1351,6 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, disable_buildid_cache(); use_browser = 0; - setup_browser(false); if (argc) { argc = parse_options(argc, argv, live_options, @@ -1409,8 +1408,6 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, err = kvm_events_live_report(kvm); out: - exit_browser(0); - if (kvm->session) perf_session__delete(kvm->session); kvm->session = NULL; -- cgit v1.1 From b3f38fc2422ace049110d1588a67b35bd15b81ce Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 12:00:56 +0900 Subject: perf report: Check argument before calling setup_browser() This is necessary to get rid of the browser dependency from usage_with_options() and its friends. Because there's no code changing the argc and argv, it'd be ok to check it early. Signed-off-by: Namhyung Kim Reviewed-by: Josh Poimboeuf Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449716459-23004-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index af5db88..5a45466 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -801,6 +801,16 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) perf_config(report__config, &report); argc = parse_options(argc, argv, options, report_usage, 0); + if (argc) { + /* + * Special case: if there's an argument left then assume that + * it's a symbol filter: + */ + if (argc > 1) + usage_with_options(report_usage, options); + + report.symbol_filter_str = argv[0]; + } if (symbol_conf.vmlinux_name && access(symbol_conf.vmlinux_name, R_OK)) { @@ -946,17 +956,6 @@ repeat: if (symbol__init(&session->header.env) < 0) goto error; - if (argc) { - /* - * Special case: if there's an argument left then assume that - * it's a symbol filter: - */ - if (argc > 1) - usage_with_options(report_usage, options); - - report.symbol_filter_str = argv[0]; - } - sort__setup_elide(stdout); ret = __cmd_report(&report); -- cgit v1.1 From f8a5c0b24b8b1e77a0812b0c8251db0afc0524b7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 10 Dec 2015 14:48:45 -0300 Subject: perf top: Do not show usage message when failing to create cpu/thread maps This is necessary to get rid of the browser dependency from usage_with_options() and its friends. Because we validate the targets which are used to create the cpu/thread maps and inform the user about any override performed via the chosen UI, we don't need to call the usage routine for that.
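The pattern the patch adopts is to report what actually failed, via errno, instead of dumping the usage text. A standalone sketch of that style follows; it is not perf code, the map-creation step is a stand-in, and plain strerror() is used here to sidestep the XSI/GNU strerror_r() variant differences (perf's real code uses an errbuf with strerror_r(), as the diff shows):

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	/* Stand-in for a map-creation step that fails with errno set,
	 * e.g. because the target process does not exist. */
	static int create_maps_demo(const char *pid_str)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path), "/proc/%s/status", pid_str);
		f = fopen(path, "r");
		if (f == NULL)
			return -1; /* fopen() left the reason in errno */
		fclose(f);
		return 0;
	}

	int main(void)
	{
		if (create_maps_demo("99999999") < 0) {
			fprintf(stderr, "Couldn't create thread/CPU maps: %s\n",
				errno == ENOENT ? "No such process" : strerror(errno));
			return 1; /* printing the usage text would not explain this */
		}
		return 0;
	}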
Cc: David Ahern Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-slu7lj7buzpwgop1vo9la8ma@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7e2e72e..84fd636 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1279,8 +1279,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) if (target__none(target)) target->system_wide = true; - if (perf_evlist__create_maps(top.evlist, target) < 0) - usage_with_options(top_usage, options); + if (perf_evlist__create_maps(top.evlist, target) < 0) { + ui__error("Couldn't create thread/CPU maps: %s\n", + errno == ENOENT ? "No such process" : strerror_r(errno, errbuf, sizeof(errbuf))); + goto out_delete_evlist; + } if (!top.evlist->nr_entries && perf_evlist__add_default(top.evlist) < 0) { -- cgit v1.1 From 7ecb48fde39e1d61ab8aff95581dcdfb572bcc28 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 12:00:58 +0900 Subject: perf thread_map: Free strlist on constructor error path Signed-off-by: Namhyung Kim Reviewed-by: Josh Poimboeuf Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449716459-23004-7-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/thread_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 6ec3c5c..371fb28 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -304,6 +304,7 @@ out: out_free_threads: zfree(&threads); + strlist__delete(slist); goto out; } -- cgit v1.1 From 3f86eb6b0771d785099c91354838d3f8d8126630 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 12:00:59 +0900 Subject: perf tools: Get rid of exit_browser() from usage_with_options() Since all of its users call it before setup_browser(), there's no need to call exit_browser() inside of the function. Signed-off-by: Namhyung Kim Reviewed-by: Josh Poimboeuf Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449716459-23004-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-options.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index d09aff9..de3290b 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -766,7 +766,6 @@ int usage_with_options_internal(const char * const *usagestr, void usage_with_options(const char * const *usagestr, const struct option *opts) { - exit_browser(false); usage_with_options_internal(usagestr, opts, 0, NULL); exit(129); } @@ -776,8 +775,6 @@ void usage_with_options_msg(const char * const *usagestr, { va_list ap; - exit_browser(false); - va_start(ap, fmt); strbuf_addv(&error_buf, fmt, ap); va_end(ap); -- cgit v1.1 From 61fa0e94ca6ab62db5e095a5528150bf9962196d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Dec 2015 16:53:20 +0900 Subject: perf top: Delete half-processed hist entries when exiting After sample processing is done, hist entries are in both of hists->entries and hists->entries_in (or hists->entries_collapsed). So I guess perf report does not have leaks on hists. But for perf top, it's possible to have half-processed entries which are only in hists->entries_in.
Eventually they will go to the hists->entries and get freed but they cannot be deleted by current hists__delete_entries(). This patch adds hists__delete_all_entries function to delete those entries. Signed-off-by: Namhyung Kim Tested-and-Acked-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1449734015-9148-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 565ea35..56e97f5 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -270,6 +270,8 @@ static void hists__delete_entry(struct hists *hists, struct hist_entry *he) if (sort__need_collapse) rb_erase(&he->rb_node_in, &hists->entries_collapsed); + else + rb_erase(&he->rb_node_in, hists->entries_in); --hists->nr_entries; if (!he->filtered) @@ -1567,11 +1569,33 @@ static int hists_evsel__init(struct perf_evsel *evsel) return 0; } +static void hists__delete_remaining_entries(struct rb_root *root) +{ + struct rb_node *node; + struct hist_entry *he; + + while (!RB_EMPTY_ROOT(root)) { + node = rb_first(root); + rb_erase(node, root); + + he = rb_entry(node, struct hist_entry, rb_node_in); + hist_entry__delete(he); + } +} + +static void hists__delete_all_entries(struct hists *hists) +{ + hists__delete_entries(hists); + hists__delete_remaining_entries(&hists->entries_in_array[0]); + hists__delete_remaining_entries(&hists->entries_in_array[1]); + hists__delete_remaining_entries(&hists->entries_collapsed); +} + static void hists_evsel__exit(struct perf_evsel *evsel) { struct hists *hists = evsel__hists(evsel); - hists__delete_entries(hists); + hists__delete_all_entries(hists); } /* -- cgit v1.1 From 8488335c039ff4917754332763e21c01a81435b4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 8 Dec 2015 16:51:24 -0300 Subject: Revert "perf tools: Improve setting of gcc debug option" This reverts commit e8b7ea4356fdd3c4de5478f3418eb84f8dce2b61. Martin created a gcc PR: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68836 Reported-by: Jiri Olsa Acked-by: Ingo Molnar Cc: David Ahern Cc: Martin Liska Cc: Namhyung Kim , Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151202164827.GA21124@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/config/Makefile | 2 -- tools/perf/config/utilities.mak | 19 ------------------- 2 files changed, 21 deletions(-) diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 6eb9a95..a552417 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -135,8 +135,6 @@ endif ifeq ($(DEBUG),0) CFLAGS += -O6 -else - CFLAGS += $(call cc-option,-Og,-O0) endif ifdef PARSER_DEBUG diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak index 0ebef09..c16ce83 100644 --- a/tools/perf/config/utilities.mak +++ b/tools/perf/config/utilities.mak @@ -177,22 +177,3 @@ $(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2))) endef _ge_attempt = $(if $(get-executable),$(get-executable),$(call _gea_err,$(2))) _gea_err = $(if $(1),$(error Please set '$(1)' appropriately)) - -# try-run -# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise) -# Exit code chooses option. "$$TMP" is can be used as temporary file and -# is automatically cleaned up. 
-try-run = $(shell set -e; \ - TMP="$(TMPOUT).$$$$.tmp"; \ - TMPO="$(TMPOUT).$$$$.o"; \ - if ($(1)) >/dev/null 2>&1; \ - then echo "$(2)"; \ - else echo "$(3)"; \ - fi; \ - rm -f "$$TMP" "$$TMPO") - -# cc-option -# Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) - -cc-option = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) -- cgit v1.1 From 9d8b172f29ac0e5d1923d348e395e9643625ef7f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:23 +0900 Subject: perf tools: Make perf_session__register_idle_thread drop the refcount Note that since the thread was already inserted into the session list, it will be released when the session is released. Also, in the perf_session__register_idle_thread() failure path, the thread should be put before returning. Refcnt debugger shows that the perf_session__register_idle_thread gets the returned thread, but the caller (__cmd_top) does not put the returned idle thread. ---- ==== [0] ==== Unreclaimed thread@0x24e6240 Refcount +1 => 0 at ./perf(thread__new+0xe5) [0x4c8a75] ./perf(machine__findnew_thread+0x9a) [0x4bbdba] ./perf(perf_session__register_idle_thread+0x28) [0x4c63c8] ./perf(cmd_top+0xd7d) [0x43cf6d] ./perf() [0x47ba35] ./perf(main+0x617) [0x4225b7] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f06027c5af5] ./perf() [0x42272d] Refcount +1 => 1 at ./perf(thread__get+0x2c) [0x4c8bcc] ./perf(machine__findnew_thread+0xee) [0x4bbe0e] ./perf(perf_session__register_idle_thread+0x28) [0x4c63c8] ./perf(cmd_top+0xd7d) [0x43cf6d] ./perf() [0x47ba35] ./perf(main+0x617) [0x4225b7] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f06027c5af5] ./perf() [0x42272d] Refcount +1 => 2 at ./perf(thread__get+0x2c) [0x4c8bcc] ./perf(machine__findnew_thread+0x112) [0x4bbe32] ./perf(perf_session__register_idle_thread+0x28) [0x4c63c8] ./perf(cmd_top+0xd7d) [0x43cf6d] ./perf() [0x47ba35] ./perf(main+0x617) [0x4225b7] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f06027c5af5] ./perf() [0x42272d] ---- Signed-off-by: Masami Hiramatsu Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021122.10245.69707.stgit@localhost.localdomain [ Drop the refcount in perf_session__register_idle_thread() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 2 +- tools/perf/util/session.c | 11 +++++++---- tools/perf/util/session.h | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 84fd636..785aa2d 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -964,7 +964,7 @@ static int __cmd_top(struct perf_top *top) if (ret) goto out_delete; - if (perf_session__register_idle_thread(top->session) == NULL) + if (perf_session__register_idle_thread(top->session) < 0) goto out_delete; machine__synthesize_threads(&top->session->machines.host, &opts->target, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index c35ffdd..9774686 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1311,17 +1311,20 @@ struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) return machine__findnew_thread(&session->machines.host, -1, pid); } -struct thread *perf_session__register_idle_thread(struct perf_session *session) +int perf_session__register_idle_thread(struct perf_session *session) { struct thread *thread; + int err = 0; thread =
machine__findnew_thread(&session->machines.host, 0, 0); if (thread == NULL || thread__set_comm(thread, "swapper", 0)) { pr_err("problem inserting idle task.\n"); - thread = NULL; + err = -1; } - return thread; + /* machine__findnew_thread() got the thread, so put it */ + thread__put(thread); + return err; } static void perf_session__warn_about_errors(const struct perf_session *session) @@ -1676,7 +1679,7 @@ int perf_session__process_events(struct perf_session *session) u64 size = perf_data_file__size(session->file); int err; - if (perf_session__register_idle_thread(session) == NULL) + if (perf_session__register_idle_thread(session) < 0) return -ENOMEM; if (!perf_data_file__is_pipe(session->file)) diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 3e900c0..5f792e3 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -89,7 +89,7 @@ struct machine *perf_session__findnew_machine(struct perf_session *session, pid_ } struct thread *perf_session__findnew(struct perf_session *session, pid_t pid); -struct thread *perf_session__register_idle_thread(struct perf_session *session); +int perf_session__register_idle_thread(struct perf_session *session); size_t perf_session__fprintf(struct perf_session *session, FILE *fp); -- cgit v1.1 From e7a7865cc0da306542db0b9205cb0a467f59e33d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2015 11:11:18 +0900 Subject: perf symbols: Fix dso__load_sym to put dso Fix dso__load_sym to put dso because dsos__add already got it. Refcnt debugger explains the problem: ---- ==== [0] ==== Unreclaimed dso: 0x19dd200 Refcount +1 => 1 at ./perf(dso__new+0x1ff) [0x4a62df] ./perf(dso__load_sym+0xe89) [0x503509] ./perf(dso__load_vmlinux+0xbf) [0x4aa77f] ./perf(dso__load_vmlinux_path+0x8c) [0x4aa8dc] ./perf() [0x50539a] ./perf(convert_perf_probe_events+0xd79) [0x50ad39] ./perf() [0x45600f] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f74dd0efaf5] ./perf() [0x4220a9] Refcount +1 => 2 at ./perf(dso__get+0x34) [0x4a65f4] ./perf(map__new2+0x76) [0x4be216] ./perf(dso__load_sym+0xee1) [0x503561] ./perf(dso__load_vmlinux+0xbf) [0x4aa77f] ./perf(dso__load_vmlinux_path+0x8c) [0x4aa8dc] ./perf() [0x50539a] ./perf(convert_perf_probe_events+0xd79) [0x50ad39] ./perf() [0x45600f] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f74dd0efaf5] ./perf() [0x4220a9] Refcount +1 => 3 at ./perf(dsos__add+0xf3) [0x4a6bc3] ./perf(dso__load_sym+0xfc1) [0x503641] ./perf(dso__load_vmlinux+0xbf) [0x4aa77f] ./perf(dso__load_vmlinux_path+0x8c) [0x4aa8dc] ./perf() [0x50539a] ./perf(convert_perf_probe_events+0xd79) [0x50ad39] ./perf() [0x45600f] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f74dd0efaf5] ./perf() [0x4220a9] Refcount -1 => 2 at ./perf(dso__put+0x2f) [0x4a664f] ./perf(map_groups__exit+0xb9) [0x4bee29] ./perf(machine__delete+0xb0) [0x4b93d0] ./perf(exit_probe_symbol_maps+0x28) [0x506718] ./perf() [0x45628a] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610) [0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f74dd0efaf5] ./perf() [0x4220a9] Refcount -1 => 1 at ./perf(dso__put+0x2f) [0x4a664f] ./perf(machine__delete+0xfe) [0x4b941e] ./perf(exit_probe_symbol_maps+0x28) [0x506718] ./perf() [0x45628a] ./perf(cmd_probe+0x6c) [0x4566bc] ./perf() [0x47abc5] ./perf(main+0x610)
[0x421f90] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f74dd0efaf5] ./perf() [0x4220a9] ---- So, in dso__load_sym, the dso is gotten 3 times, by dso__new, map__new2, and dsos__add. The last 2 are actually released by map_groups and machine__delete correspondingly. However, the first reference, by dso__new, is never released. Committer note: Changed the place where the reference count is dropped to: Fix it by dropping it right after creating curr_map, since we know that either that operation failed and we need to drop the dso refcount or that it succeeded and we have it referenced via curr_map->dso. Then only drop the curr_map refcount after we call dsos__add() to make sure we hold a reference to it via curr_map->dso. Signed-off-by: Masami Hiramatsu Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20151209021118.10245.49869.stgit@localhost.localdomain Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 53f1996..562b8eb 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1026,8 +1026,8 @@ int dso__load_sym(struct dso *dso, struct map *map, curr_dso->long_name_len = dso->long_name_len; curr_map = map__new2(start, curr_dso, map->type); + dso__put(curr_dso); if (curr_map == NULL) { - dso__put(curr_dso); goto out_elf_end; } if (adjust_kernel_syms) { @@ -1042,9 +1042,14 @@ int dso__load_sym(struct dso *dso, struct map *map, } curr_dso->symtab_type = dso->symtab_type; map_groups__insert(kmaps, curr_map); + /* + * Add it before we drop the reference to curr_map, + * i.e. while we still are sure to have a reference + * to this DSO via curr_map->dso. + */ + dsos__add(&map->groups->machine->dsos, curr_dso); /* kmaps already got it */ map__put(curr_map); - dsos__add(&map->groups->machine->dsos, curr_dso); dso__set_loaded(curr_dso, map->type); } else curr_dso = curr_map->dso; -- cgit v1.1 From 677a73a9aa5433ea728200c26a7b3506d5eaa92b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:18 -0800 Subject: x86/kvm: On KVM re-enable (e.g. after suspend), update clocks This gets rid of the "did TSC go backwards" logic and just updates all clocks. It should work better (no more disabling of fast timing) and more reliably (all of the clocks are actually updated). Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/861716d768a1da6d1fd257b7972f8df13baf7f85.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kvm/x86.c | 75 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 72 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00462bd..6e32e87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,8 +123,6 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -static bool __read_mostly backwards_tsc_observed = false; - #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1671,7 +1669,6 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -7366,88 +7363,22 @@ int kvm_arch_hardware_enable(void) struct kvm_vcpu *vcpu; int i; int ret; - u64 local_tsc; - u64 max_tsc = 0; - bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); ret = kvm_x86_ops->hardware_enable(); if (ret != 0) return ret; - local_tsc = rdtsc(); - stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (!stable && vcpu->cpu == smp_processor_id()) + if (vcpu->cpu == smp_processor_id()) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (stable && vcpu->arch.last_host_tsc > local_tsc) { - backwards_tsc = true; - if (vcpu->arch.last_host_tsc > max_tsc) - max_tsc = vcpu->arch.last_host_tsc; + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, + vcpu); } } } - /* - * Sometimes, even reliable TSCs go backwards. This happens on - * platforms that reset TSC during suspend or hibernate actions, but - * maintain synchronization. We must compensate. Fortunately, we can - * detect that condition here, which happens early in CPU bringup, - * before any KVM threads can be running. Unfortunately, we can't - * bring the TSCs fully up to date with real time, as we aren't yet far - * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, get_kernel_ns() will be using boot - * variables that haven't been updated yet. - * - * So we simply find the maximum observed TSC above, then record the - * adjustment to TSC in each VCPU. When the VCPU later gets loaded, - * the adjustment will be applied. Note that we accumulate - * adjustments, in case multiple suspend cycles happen before some VCPU - * gets a chance to run again. In the event that no KVM threads get a - * chance to run, we will miss the entire elapsed period, as we'll have - * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may - * loose cycle time. This isn't too big a deal, since the loss will be - * uniform across all VCPUs (not to mention the scenario is extremely - * unlikely). It is possible that a second hibernate recovery happens - * much faster than a first, causing the observed TSC here to be - * smaller; this would require additional padding adjustment, which is - * why we set last_host_tsc to the local tsc observed here. - * - * N.B. - this code below runs only on platforms with reliable TSC, - * as that is the only way backwards_tsc is set above. 
Also note - * that this runs for ALL vcpus, which is not a bug; all VCPUs should - * have the same delta_cyc adjustment applied if backwards_tsc - * is detected. Note further, this adjustment is only done once, - * as we reset last_host_tsc on all VCPUs to stop this from being - * called multiple times (one for each physical CPU bringup). - * - * Platforms with unreliable TSCs don't have to deal with this, they - * will be compensated by the logic in vcpu_load, which sets the TSC to - * catchup mode. This will catchup all VCPUs to real time, but cannot - * guarantee that they stay in perfect synchronization. - */ - if (backwards_tsc) { - u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.tsc_offset_adjustment += delta_cyc; - vcpu->arch.last_host_tsc = local_tsc; - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - } - - /* - * We have to disable TSC offset matching.. if you were - * booting a VM while issuing an S4 host suspend.... - * you may have some problem. Solving this issue is - * left as an exercise to the reader. - */ - kvm->arch.last_tsc_nsec = 0; - kvm->arch.last_tsc_write = 0; - } - - } return 0; } -- cgit v1.1 From 6b078f5de7fc0851af4102493c7b5bb07e49c4cb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:19 -0800 Subject: x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader The pvclock vdso code was too abstracted to understand easily and excessively paranoid. Simplify it for a huge speedup. This opens the door for additional simplifications, as the vdso no longer accesses the pvti for any vcpu other than vcpu 0. Before, vclock_gettime using kvm-clock took about 45ns on my machine. With this change, it takes 29ns, which is almost as fast as the pure TSC implementation. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6b51dcc41f1b101f963945c5ec7093d72bdac429.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 81 ++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index ca94fa6..c325ba1 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -78,47 +78,58 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) static notrace cycle_t vread_pvclock(int *mode) { - const struct pvclock_vsyscall_time_info *pvti; + const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - + u64 tsc, pvti_tsc; + u64 last, delta, pvti_system_time; + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. 
+ * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { *mode = VCLOCK_NONE; + return 0; + } + + do { + version = pvti->version; + + /* This is also a read barrier, so we'll read version first. */ + tsc = rdtsc_ordered(); + + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; + pvti_tsc_shift = pvti->tsc_shift; + pvti_system_time = pvti->system_time; + pvti_tsc = pvti->tsc_timestamp; + + /* Make sure that the version double-check is last. */ + smp_rmb(); + } while (unlikely((version & 1) || version != pvti->version)); + + delta = tsc - pvti_tsc; + ret = pvti_system_time + + pvclock_scale_delta(delta, pvti_tsc_to_system_mul, + pvti_tsc_shift); /* refer to tsc.c read_tsc() comment for rationale */ last = gtod->cycle_last; -- cgit v1.1 From dac16fba6fc590fa7239676b35ed75dae4c4cd2b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:20 -0800 Subject: x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/9d37826fdc7e2d2809efe31d5345f97186859284.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 20 ++++++++------------ arch/x86/entry/vdso/vdso-layout.lds.S | 3 ++- arch/x86/entry/vdso/vdso2c.c | 3 +++ arch/x86/entry/vdso/vma.c | 13 +++++++++++++ arch/x86/include/asm/pvclock.h | 9 +++++++++ arch/x86/include/asm/vdso.h | 1 + arch/x86/kernel/kvmclock.c | 5 +++++ 7 files changed, 41 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index c325ba1..5dd363d 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void) } #endif +#ifdef CONFIG_PARAVIRT_CLOCK +extern u8 pvclock_page + __attribute__((visibility("hidden"))); +#endif + #ifndef BUILD_VDSO32 #include @@ -62,23 +67,14 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) #ifdef CONFIG_PARAVIRT_CLOCK -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) { - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; + return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } static notrace cycle_t vread_pvclock(int *mode) { - const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; + const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; cycle_t ret; u64 tsc, pvti_tsc; u64 last, delta, pvti_system_time; diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index de2c921..4158acc 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ SECTIONS #undef EMIT_VVAR hpet_page = vvar_start + PAGE_SIZE; + pvclock_page = vvar_start + 2 * PAGE_SIZE; . 
= SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 785d992..491020b 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,6 +73,7 @@ enum { sym_vvar_start, sym_vvar_page, sym_hpet_page, + sym_pvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -80,6 +81,7 @@ enum { const int special_pages[] = { sym_vvar_page, sym_hpet_page, + sym_pvclock_page, }; struct vdso_sym { @@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, + [sym_pvclock_page] = {"pvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 64df471..aa82819 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) .name = "[vvar]", .pages = no_pages, }; + struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } #endif + pvti = pvclock_pvti_cpu0_va(); + if (pvti && image->sym_pvclock_page) { + ret = remap_pfn_range(vma, + text_start + image->sym_pvclock_page, + __pa(pvti) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + } + up_fail: if (ret) current->mm->context.vdso = NULL; diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7a6bed5..3864398 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,6 +4,15 @@ #include #include +#ifdef CONFIG_PARAVIRT_CLOCK +extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 756de91..deabaf9 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -22,6 +22,7 @@ struct vdso_image { long sym_vvar_page; long sym_hpet_page; + long sym_pvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2bd81e3..ec1b06d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; +struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return hv_clock; +} + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for -- cgit v1.1 From cc1e24fdb064d3126a494716f22ad4fc39306742 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:21 -0800 Subject: x86/vdso: Remove pvclock fixmap machinery Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/4933029991103ae44672c82b97a20035f5c1fe4f.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 1 - arch/x86/entry/vdso/vma.c | 1 + arch/x86/include/asm/fixmap.h | 5 ----- arch/x86/include/asm/pvclock.h | 5 ----- arch/x86/kernel/kvmclock.c | 6 ------ arch/x86/kernel/pvclock.c | 24 ------------------------ 6 files changed, 1 insertion(+), 41 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 5dd363d..59a98c2 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -45,7 +45,6 @@ extern u8 pvclock_page #include #include -#include #include notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index aa82819..b8f69e2 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index f80d700..6d7d0e5 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -19,7 +19,6 @@ #include #include #include -#include #ifdef CONFIG_X86_32 #include #include @@ -72,10 +71,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_VSYSCALL_EMULATION VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, #endif -#ifdef CONFIG_PARAVIRT_CLOCK - PVCLOCK_FIXMAP_BEGIN, - PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, -#endif #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 3864398..66df22b 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -100,10 +100,5 @@ struct pvclock_vsyscall_time_info { } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) -#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) - -int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, - int size); -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ec1b06d..72cef58 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -310,7 +310,6 @@ int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 int cpu; - int ret; u8 flags; struct pvclock_vcpu_time_info *vcpu_time; unsigned int size; @@ -330,11 +329,6 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } - if ((ret = pvclock_init_vsyscall(hv_clock, size))) { - put_cpu(); - return ret; - } - put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d2..99bfc02 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -140,27 +140,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } - -#ifdef CONFIG_X86_64 -/* - * Initialize the generic pvclock vsyscall state. 
This will allocate - * a/some page(s) for the per-vcpu pvclock information, set up a - * fixmap mapping for the page(s) - */ - -int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, - int size) -{ - int idx; - - WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { - __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, - __pa(i) + (idx*PAGE_SIZE), - PAGE_KERNEL_VVAR); - } - - return 0; -} -#endif -- cgit v1.1 From 76480a6a55a03d0fe5dd6290ccde7f78678ab85e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:22 -0800 Subject: x86/vdso: Enable vdso pvclock access on all vdso variants Now that pvclock doesn't require access to the fixmap, all vdso variants can use it. The kernel side isn't wired up for 32-bit kernels yet, but this covers 32-bit and x32 userspace on 64-bit kernels. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/a7ef693b7a4c88dd2173dc1d4bf6bc27023626eb.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 91 ++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 51 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 59a98c2..8602f06 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #define gtod (&VVAR(vsyscall_gtod_data)) @@ -43,10 +45,6 @@ extern u8 pvclock_page #ifndef BUILD_VDSO32 -#include -#include -#include - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -64,8 +62,42 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) return ret; } -#ifdef CONFIG_PARAVIRT_CLOCK +#else + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#endif + +#ifdef CONFIG_PARAVIRT_CLOCK static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) { return (const struct pvclock_vsyscall_time_info *)&pvclock_page; @@ -109,9 +141,9 @@ static notrace cycle_t vread_pvclock(int *mode) do { version = pvti->version; - /* This is also a read barrier, so we'll read version first. 
*/ - tsc = rdtsc_ordered(); + smp_rmb(); + tsc = rdtsc_ordered(); pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; pvti_tsc_shift = pvti->tsc_shift; pvti_system_time = pvti->system_time; @@ -126,7 +158,7 @@ static notrace cycle_t vread_pvclock(int *mode) pvclock_scale_delta(delta, pvti_tsc_to_system_mul, pvti_tsc_shift); - /* refer to tsc.c read_tsc() comment for rationale */ + /* refer to vread_tsc() comment for rationale */ last = gtod->cycle_last; if (likely(ret >= last)) @@ -136,49 +168,6 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif -#else - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) - : "memory", "edx"); - return ret; -} - -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace cycle_t vread_pvclock(int *mode) -{ - *mode = VCLOCK_NONE; - return 0; -} -#endif - -#endif - notrace static cycle_t vread_tsc(void) { cycle_t ret = (cycle_t)rdtsc_ordered(); -- cgit v1.1 From d51953b0873358d13b189996e6976dfa12a9b59d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Dec 2015 09:01:30 +0100 Subject: x86/platform/uv: Include clocksource.h for clocksource_touch_watchdog() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This build failure triggers on 64-bit allmodconfig: arch/x86/platform/uv/uv_nmi.c:493:2: error: implicit declaration of function ‘clocksource_touch_watchdog’ [-Werror=implicit-function-declaration] which is caused by recent changes exposing a missing clocksource.h include in uv_nmi.c: cc1e24fdb064 x86/vdso: Remove pvclock fixmap machinery. This file got clocksource.h indirectly via fixmap.h - that stealth route of header inclusion is now gone. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/platform/uv/uv_nmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 327f21c..8dd8005 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include -- cgit v1.1 From 64226bcf64629996948dc03c38594f00511bfc2b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 11 Dec 2015 11:56:53 +0900 Subject: perf top: Do not convert address for perf_top__record_precise_ip() We call map->unmap_ip() before the function and call map->map_ip() inside the function. This is meaningless and looks strange since only one of the two checks 'map'. Let's use al->addr directly.
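To see why the removed round trip was a no-op, here is a simplified sketch modeled loosely on perf's struct map; the arithmetic is illustrative rather than perf's exact implementation, but it captures the point that map_ip() and unmap_ip() are inverses, so applying both returns the original address:

	#include <assert.h>
	#include <stdio.h>

	typedef unsigned long long u64;

	struct map {
		u64 start; /* address the object is mapped at */
		u64 pgoff; /* file offset of the mapping */
	};

	/* memory address -> object-relative address */
	static u64 map_ip(const struct map *map, u64 ip)
	{
		return ip - map->start + map->pgoff;
	}

	/* object-relative address -> memory address */
	static u64 unmap_ip(const struct map *map, u64 ip)
	{
		return ip + map->start - map->pgoff;
	}

	int main(void)
	{
		struct map map = { .start = 0x400000, .pgoff = 0x1000 };
		u64 addr = 0x1234; /* already object-relative, like al->addr */

		/* unmap_ip() followed by map_ip() is the identity... */
		assert(map_ip(&map, unmap_ip(&map, addr)) == addr);
		/* ...so the patch just passes al->addr through untouched */
		printf("0x%llx survives the round trip\n", addr);
		return 0;
	}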
Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449802616-16170-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 785aa2d..3b0978e 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -189,8 +189,6 @@ static void perf_top__record_precise_ip(struct perf_top *top, if (pthread_mutex_trylock(&notes->lock)) return; - ip = he->ms.map->map_ip(he->ms.map, ip); if (ui__has_annotation()) err = hist_entry__inc_addr_samples(he, counter, ip); @@ -687,14 +685,8 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, struct hist_entry *he = iter->he; struct perf_evsel *evsel = iter->evsel; - if (sort__has_sym && single) { - u64 ip = al->addr; - - if (al->map) - ip = al->map->unmap_ip(al->map, ip); - - perf_top__record_precise_ip(top, he, evsel->idx, ip); - } + if (sort__has_sym && single) + perf_top__record_precise_ip(top, he, evsel->idx, al->addr); hist__account_cycles(iter->sample->branch_stack, al, iter->sample, !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY)); -- cgit v1.1 From 151ee834cc946fa159ee406c62b4d5ce1ebd7115 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 11 Dec 2015 11:56:54 +0900 Subject: perf top: Access hists->lock only if needed The perf_top__record_precise_ip() releases and regrabs the he->hists->lock because it can sleep if there's an error. But it should be done conditionally as it slows down the fast path. Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449802616-16170-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 3b0978e..586798a 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -194,21 +194,23 @@ static void perf_top__record_precise_ip(struct perf_top *top, pthread_mutex_unlock(&notes->lock); - /* - * This function is now called with he->hists->lock held. - * Release it before going to sleep. - */ - pthread_mutex_unlock(&he->hists->lock); + if (unlikely(err)) { + /* + * This function is now called with he->hists->lock held. + * Release it before going to sleep. + */ + pthread_mutex_unlock(&he->hists->lock); + + if (err == -ERANGE && !he->ms.map->erange_warned) + ui__warn_map_erange(he->ms.map, sym, ip); + else if (err == -ENOMEM) { + pr_err("Not enough memory for annotating '%s' symbol!\n", + sym->name); + sleep(1); + } - if (err == -ERANGE && !he->ms.map->erange_warned) - ui__warn_map_erange(he->ms.map, sym, ip); - else if (err == -ENOMEM) { - pr_err("Not enough memory for annotating '%s' symbol!\n", - sym->name); - sleep(1); + pthread_mutex_lock(&he->hists->lock); } - - pthread_mutex_lock(&he->hists->lock); } static void perf_top__show_details(struct perf_top *top) -- cgit v1.1 From 448f13b2d18fdc8dbaada97442e8954dcb4ef8fa Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 11 Dec 2015 11:56:55 +0900 Subject: perf top: Fix annotation on --stdio The ui__has_annotation() inside perf_top__record_precise_ip() should be removed since it returns true only for TUI (and when sort key has symbol).
However the 'perf top --stdio' also supports annotation for a symbol which was specified by the 's' key action. Actually it already does the necessary checks before calling the function. So it's ok to get rid of the check here. Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449802616-16170-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 586798a..f447e55 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -189,8 +189,7 @@ static void perf_top__record_precise_ip(struct perf_top *top, if (pthread_mutex_trylock(&notes->lock)) return; - if (ui__has_annotation()) - err = hist_entry__inc_addr_samples(he, counter, ip); + err = hist_entry__inc_addr_samples(he, counter, ip); pthread_mutex_unlock(&notes->lock); -- cgit v1.1 From beefb8d0e556aaf3cb69168c5953e023ace6aa78 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 11 Dec 2015 11:56:56 +0900 Subject: perf top: Cleanup condition in perf_top__record_precise_ip() The 'he' cannot be NULL since its caller hist_iter__top_callback() is called only if iter->he is not NULL (see hist_entry_iter__add). So set 'sym' before the condition to simplify the code. Also make it clearer that the top->symbol_filter_entry check is only meaningful on stdio mode (i.e. when use_browser is 0). Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1449802616-16170-4-git-send-email-namhyung@kernel.org [ Complete the simplification replacing one more he->ms.sym with sym ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f447e55..92fe963 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -175,15 +175,14 @@ static void perf_top__record_precise_ip(struct perf_top *top, int counter, u64 ip) { struct annotation *notes; - struct symbol *sym; + struct symbol *sym = he->ms.sym; int err = 0; - if (he == NULL || he->ms.sym == NULL || - ((top->sym_filter_entry == NULL || - top->sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1)) + if (sym == NULL || (use_browser == 0 && + (top->sym_filter_entry == NULL || + top->sym_filter_entry->ms.sym != sym))) return; - sym = he->ms.sym; notes = symbol__annotation(sym); if (pthread_mutex_trylock(&notes->lock)) -- cgit v1.1 From 973170e66726672518eb935eb0dc0e63876d133d Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 8 Dec 2015 02:25:29 +0000 Subject: tools lib bpf: Check return value of strdup when reading map names Commit 561bbccac72d08babafaa33fd7fa9100ec4c9fb6 ("tools lib bpf: Extract and collect map names from BPF object file") forgets to check the return value of strdup(). This patch fixes it. It also checks the names pointer before strcmp() for safety.
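A standalone sketch of the pattern the fix enforces follows; it is not libbpf code, and names like map_entry__set_name are invented for illustration. The two halves mirror the patch: every strdup() result is checked and the error propagated, and lookups tolerate entries whose name pointer stayed NULL:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct map_entry {
		char *name; /* may stay NULL if strdup() failed earlier */
	};

	static int map_entry__set_name(struct map_entry *entry, const char *name)
	{
		entry->name = strdup(name);
		if (entry->name == NULL) {
			fprintf(stderr, "failed to alloc map name\n");
			return -ENOMEM; /* propagate instead of storing NULL silently */
		}
		return 0;
	}

	static struct map_entry *find_by_name(struct map_entry *maps, int nr_maps,
					      const char *name)
	{
		int i;

		for (i = 0; i < nr_maps; i++) {
			/* check the pointer before strcmp(), as the fix does */
			if (maps[i].name && !strcmp(maps[i].name, name))
				return &maps[i];
		}
		return NULL;
	}

	int main(void)
	{
		struct map_entry maps[1] = { { NULL } };

		if (map_entry__set_name(&maps[0], "my_map"))
			return 1;
		printf("found: %s\n", find_by_name(maps, 1, "my_map")->name);
		free(maps[0].name);
		return 0;
	}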
Signed-off-by: Wang Nan Acked-by: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Fixes: 561bbccac72d ("tools lib bpf: Extract and collect map names from BPF object file") Link: http://lkml.kernel.org/r/1449541544-67621-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a298614..16485ab 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -527,14 +527,14 @@ bpf_object__init_maps(struct bpf_object *obj, void *data, return 0; } -static void +static int bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx) { int i; Elf_Data *symbols = obj->efile.symbols; if (!symbols || maps_shndx < 0) - return; + return -EINVAL; for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { GElf_Sym sym; @@ -556,9 +556,14 @@ bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx) continue; } obj->maps[map_idx].name = strdup(map_name); + if (!obj->maps[map_idx].name) { + pr_warning("failed to alloc map name\n"); + return -ENOMEM; + } pr_debug("map %zu is \"%s\"\n", map_idx, obj->maps[map_idx].name); } + return 0; } static int bpf_object__elf_collect(struct bpf_object *obj) @@ -663,7 +668,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } if (maps_shndx >= 0) - bpf_object__init_maps_name(obj, maps_shndx); + err = bpf_object__init_maps_name(obj, maps_shndx); out: return err; } @@ -1372,7 +1377,7 @@ bpf_object__get_map_by_name(struct bpf_object *obj, const char *name) struct bpf_map *pos; bpf_map__for_each(pos, obj) { - if (strcmp(pos->name, name) == 0) + if (pos->name && !strcmp(pos->name, name)) return pos; } return NULL; } -- cgit v1.1 From 77ba9a5b48a7c742f9a46d26596852e9cfec7900 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 8 Dec 2015 02:25:30 +0000 Subject: tools lib bpf: Fetch map names from correct strtab Namhyung Kim pointed out a potential problem in the original code: it fetches map names from the section header string table, which is meant to store section names. The original code doesn't cause an error because of an LLVM behavior: LLVM combines .shstrtab into .strtab. For example: $ echo 'int func() {return 0;}' | x86_64-oe-linux-clang -x c -o temp.o -c - $ readelf -h ./temp.o ELF Header: Magic: 7f 45 4c 46 02 01 01 03 00 00 00 00 00 00 00 00 ... Section header string table index: 1 $ readelf -S ./temp.o There are 10 section headers, starting at offset 0x288: Section Headers: [Nr] Name Type Address Offset Size EntSize Flags Link Info Align [ 0] NULL 0000000000000000 00000000 0000000000000000 0000000000000000 0 0 0 [ 1] .strtab STRTAB 0000000000000000 00000230 0000000000000051 0000000000000000 0 0 1 ... $ readelf -p .strtab ./temp.o String dump of section '.strtab': [ 1] .text [ 7] .comment [ 10] .bss [ 15] .note.GNU-stack [ 25] .rela.eh_frame [ 34] func [ 39] .strtab [ 41] .symtab [ 49] .data [ 4f] - $ readelf -p .shstrtab ./temp.o readelf: Warning: Section '.shstrtab' was not dumped because it does not exist! Here, the 'section header string table index' points to '.strtab', and symbol names are also stored there.
However, in the case of gcc: $ echo 'int func() {return 0;}' | gcc -x c -o temp.o -c - $ readelf -p .shstrtab ./temp.o String dump of section '.shstrtab': [ 1] .symtab [ 9] .strtab [ 11] .shstrtab [ 1b] .text [ 21] .data [ 27] .bss [ 2c] .comment [ 35] .note.GNU-stack [ 45] .rela.eh_frame $ readelf -p .strtab ./temp.o String dump of section '.strtab': [ 1] func They are separate sections. Although the original code doesn't cause an error, we'd better use the canonical method for fetching symbol names to avoid potential behavior changes. This patch follows readelf's code and fetches the strings from the section referenced by the sh_link of the symbol table section. Signed-off-by: Wang Nan Reported-and-Acked-by: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1449541544-67621-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/libbpf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 16485ab..8334a5a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -195,6 +195,7 @@ struct bpf_object { Elf *elf; GElf_Ehdr ehdr; Elf_Data *symbols; + size_t strtabidx; struct { GElf_Shdr shdr; Elf_Data *data; @@ -547,7 +548,7 @@ bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx) continue; map_name = elf_strptr(obj->efile.elf, - obj->efile.ehdr.e_shstrndx, + obj->efile.strtabidx, sym.st_name); map_idx = sym.st_value / sizeof(struct bpf_map_def); if (map_idx >= obj->nr_maps) { @@ -630,8 +631,10 @@ static int bpf_object__elf_collect(struct bpf_object *obj) pr_warning("bpf: multiple SYMTAB in %s\n", obj->path); err = -LIBBPF_ERRNO__FORMAT; - } else { obj->efile.symbols = data; + obj->efile.strtabidx = sh.sh_link; + } } else if ((sh.sh_type == SHT_PROGBITS) && (sh.sh_flags & SHF_EXECINSTR) && (data->d_size > 0)) { @@ -667,6 +670,10 @@ static int bpf_object__elf_collect(struct bpf_object *obj) goto out; } + if (!obj->efile.strtabidx || obj->efile.strtabidx >= idx) { + pr_warning("Corrupted ELF file: index of strtab invalid\n"); + return LIBBPF_ERRNO__FORMAT; + } if (maps_shndx >= 0) err = bpf_object__init_maps_name(obj, maps_shndx); out: return err; } -- cgit v1.1 From 26812d466b2633d0c772fe3aca954129f150d3cb Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 8 Dec 2015 02:25:39 +0000 Subject: perf data: Add u32_hex data type Add a hexadecimal u32 base data type, which is useful for raw output because raw data is u32-aligned.
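As a reference, a minimal sketch of what the CREATE_INT_TYPE() invocation for u32_hex amounts to, assuming the babeltrace CTF writer API that data-convert-bt.c builds on; create_u32_hex() is an illustrative wrapper, not code from the patch:

#include <babeltrace/ctf-writer/event-types.h>

/* Create a 32-bit unsigned CTF integer field type displayed in hex. */
static struct bt_ctf_field_type *create_u32_hex(void)
{
	struct bt_ctf_field_type *type = bt_ctf_field_type_integer_create(32);

	if (!type)
		return NULL;
	bt_ctf_field_type_integer_set_signed(type, 0);
	bt_ctf_field_type_integer_set_base(type, BT_CTF_INTEGER_BASE_HEXADECIMAL);
	return type;
}

Signed-off-by: Wang Nan Cc: David S. Miller Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David S.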
Miller Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1449541544-67621-12-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data-convert-bt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 5bfc119..34cd1e4 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -63,6 +63,7 @@ struct ctf_writer { struct bt_ctf_field_type *s32; struct bt_ctf_field_type *u32; struct bt_ctf_field_type *string; + struct bt_ctf_field_type *u32_hex; struct bt_ctf_field_type *u64_hex; }; struct bt_ctf_field_type *array[6]; @@ -982,6 +983,7 @@ do { \ CREATE_INT_TYPE(cw->data.u64, 64, false, false); CREATE_INT_TYPE(cw->data.s32, 32, true, false); CREATE_INT_TYPE(cw->data.u32, 32, false, false); + CREATE_INT_TYPE(cw->data.u32_hex, 32, false, true); CREATE_INT_TYPE(cw->data.u64_hex, 64, false, true); cw->data.string = bt_ctf_field_type_string_create(); -- cgit v1.1 From 27cfef009ae8a1019d174153987ce22a0e6677fc Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 8 Dec 2015 02:25:43 +0000 Subject: perf script: Add support for PERF_TYPE_BREAKPOINT Useful for getting stack traces for hardware breakpoint events. Test result: Before this patch: # ~/perf record -g -e mem:0x600980 ./sample [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.011 MB perf.data (12 samples) ] # ~/perf script # ~/perf script -F comm,tid,pid,time,event,ip,sym,dso sample 22520/22520 97457.836294: mem:0x600980: 5a4ad8 __clear_user (/lib/modules/4.3.0-rc4+/build/vmlinux) ... 3f41ba sys_execve (/lib/modules/4.3.0-rc4+/build/vmlinux) 979395 return_from_execve (/lib/modules/4.3.0-rc4+/build/vmlinux) 7f1b59719cf7 [unknown] ([unknown]) sample 22520/22520 97457.836648: mem:0x600980: 532 main (/home/w00229757/DataBreakpoints/sample) 21bd5 __libc_start_main (/tmp/oxygen_root-root/lib64/libc-2.18.so) ... After this patch: # ~/perf script sample 22520 97457.836294: mem:0x600980: 5a4ad8 __clear_user (/lib/modules/4.3.0-rc4+/build/vmlinux) ... 
3f41ba sys_execve (/lib/modules/4.3.0-rc4+/build/vmlinux) 979395 return_from_execve (/lib/modules/4.3.0-rc4+/build/vmlinux) 7f1b59719cf7 [unknown] ([unknown]) sample 22520 97457.836648: mem:0x600980: 532 main (/home/w00229757/DataBreakpoints/sample) 21bd5 __libc_start_main (/tmp/oxygen_root-root/lib64/libc-2.18.so) Committer note: So, testing further, let's do it for a kernel global variable, tcp_hashinfo: # grep -w tcp_hashinfo /proc/kallsyms ffffffff8202fc00 B tcp_hashinfo # Note: allow specifying mem:tcp_hashinfo: # perf record -g -e mem:0xffffffff81c65ac0 -a ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.790 MB perf.data ] # # perf evlist mem:0xffffffff8202fc00 # perf evlist -v mem:0xffffffff8202fc00: type: 5, size: 112, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CALLCHAIN|CPU, disabled: 1, inherit: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, bp_type: 3, { bp_addr, config1 }: 0xffffffff8202fc00, { bp_len, config2 }: 0x4 # Then, after this patch: # perf script swapper 0 [000] 171036.986988: mem:0xffffffff8202fc00: 8a0fb5 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux) 8bc09d tcp_v4_early_demux (/lib/modules/4.3.0+/build/vmlinux) 896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux) 8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux) 855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) 8572a8 process_backlog (/lib/modules/4.3.0+/build/vmlinux) 856b11 net_rx_action (/lib/modules/4.3.0+/build/vmlinux) 2a284b __do_softirq (/lib/modules/4.3.0+/build/vmlinux) 2a2ba3 irq_exit (/lib/modules/4.3.0+/build/vmlinux) 96b7a4 do_IRQ (/lib/modules/4.3.0+/build/vmlinux) 969807 ret_from_intr (/lib/modules/4.3.0+/build/vmlinux) 804c27 cpuidle_enter (/lib/modules/4.3.0+/build/vmlinux) 2ded22 call_cpuidle (/lib/modules/4.3.0+/build/vmlinux) 2defb6 cpu_startup_entry (/lib/modules/4.3.0+/build/vmlinux) 95d5bc rest_init (/lib/modules/4.3.0+/build/vmlinux) 1163ffa start_kernel ([kernel.vmlinux].init.text) 11634d7 x86_64_start_reservations ([kernel.vmlinux].init.text) 1163623 x86_64_start_kernel ([kernel.vmlinux].init.text) Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David S.
Miller Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1449541544-67621-16-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3c3f8d0..d259e9a 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -130,6 +130,18 @@ static struct { .invalid_fields = PERF_OUTPUT_TRACE, }, + + [PERF_TYPE_BREAKPOINT] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | + PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | + PERF_OUTPUT_PERIOD, + + .invalid_fields = PERF_OUTPUT_TRACE, + }, }; static bool output_set_by_user(void) @@ -1129,6 +1141,8 @@ static int parse_output_fields(const struct option *opt __maybe_unused, type = PERF_TYPE_TRACEPOINT; else if (!strcmp(str, "raw")) type = PERF_TYPE_RAW; + else if (!strcmp(str, "break")) + type = PERF_TYPE_BREAKPOINT; else { fprintf(stderr, "Invalid event type in field string.\n"); rc = -EINVAL; -- cgit v1.1 From 93b0ba3c60da89043ce2b9f601cd2b3da408903b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Tue, 8 Dec 2015 02:25:44 +0000 Subject: perf tools: Clear struct machine during machine__init() Many test cases use a stack-allocated 'struct machine', including: test__hists_link, test__hists_filter, test__mmap_thread_lookup, test__thread_mg_share, test__hists_output and test__hists_cumulate. Also, non-test code (for example, machine__new_host()) uses malloc() to allocate a struct machine. These are dangerous operations that can make tests fail or hang in machines__exit(). For example, in machines__exit -> machine__destroy_kernel_maps -> map_groups__remove -> maps__remove -> pthread_rwlock_wrlock, an incorrectly initialized lock causes unintended behavior. This patch memsets the structure to zero in machine__init() to ensure all fields of 'struct machine' are initialized to zero. Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1449541544-67621-17-git-send-email-wangnan0@huawei.com [ Use memset, see 'man bzero' ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f5882b8..1407d51 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -25,6 +25,7 @@ static void dsos__init(struct dsos *dsos) int machine__init(struct machine *machine, const char *root_dir, pid_t pid) { + memset(machine, 0, sizeof(*machine)); map_groups__init(&machine->kmaps, machine); RB_CLEAR_NODE(&machine->rb_node); dsos__init(&machine->dsos); -- cgit v1.1 From c4803c497fbdb37e96af614813a7cfb434b6682a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:41 +0100 Subject: nfs: Move call to security_inode_listsecurity into nfs_listxattr Add an nfs_listxattr operation. Move the call to security_inode_listsecurity from the list operation of the "security.*" xattr handler to nfs_listxattr.
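For context, a minimal userspace sketch of the listxattr(2) contract that nfs_listxattr has to preserve (illustrative only, error handling trimmed): a NULL buffer sizes the result, and a second call fills the buffer with concatenated NUL-terminated attribute names.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	(void)argc;
	ssize_t len = listxattr(argv[1], NULL, 0);	/* size query */
	char *names = malloc(len);

	len = listxattr(argv[1], names, len);		/* concatenated names */
	for (char *p = names; p < names + len; p += strlen(p) + 1)
		printf("%s\n", p);			/* one attribute per line */
	free(names);
	return 0;
}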
Signed-off-by: Andreas Gruenbacher Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Signed-off-by: Al Viro --- fs/nfs/nfs4proc.c | 53 +++++++++++++++++++++++++++++++--------------- fs/xattr.c | 4 ++++ security/smack/smack_lsm.c | 2 -- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f6f40aa..dbfade2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6279,10 +6279,6 @@ static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler *handler, } #ifdef CONFIG_NFS_V4_SECURITY_LABEL -static inline int nfs4_server_supports_labels(struct nfs_server *server) -{ - return server->caps & NFS_CAP_SECURITY_LABEL; -} static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, struct dentry *dentry, const char *key, @@ -6304,29 +6300,34 @@ static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler, return -EOPNOTSUPP; } -static size_t nfs4_xattr_list_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len) +static ssize_t +nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) { - size_t len = 0; + int len = 0; - if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { - len = security_inode_listsecurity(d_inode(dentry), NULL, 0); - if (list && len <= list_len) - security_inode_listsecurity(d_inode(dentry), list, len); + if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(inode, list, list_len); + if (list_len && len > list_len) + return -ERANGE; } return len; } static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = nfs4_xattr_list_nfs4_label, .get = nfs4_xattr_get_nfs4_label, .set = nfs4_xattr_set_nfs4_label, }; -#endif +#else + +static ssize_t +nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) +{ + return 0; +} + +#endif /* * nfs_fhget will use either the mounted_on_fileid or the fileid @@ -8743,6 +8744,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; +ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t error, error2; + + error = generic_listxattr(dentry, list, size); + if (error < 0) + return error; + if (list) { + list += error; + size -= error; + } + + error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); + if (error2 < 0) + return error2; + return error + error2; +} + static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -8759,7 +8778,7 @@ static const struct inode_operations nfs4_dir_inode_operations = { .setattr = nfs_setattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, - .listxattr = generic_listxattr, + .listxattr = nfs4_listxattr, .removexattr = generic_removexattr, }; @@ -8769,7 +8788,7 @@ static const struct inode_operations nfs4_file_inode_operations = { .setattr = nfs_setattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, - .listxattr = generic_listxattr, + .listxattr = nfs4_listxattr, .removexattr = generic_removexattr, }; diff --git a/fs/xattr.c b/fs/xattr.c index c3af6c9..2c77764 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -723,6 +723,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!buffer) { for_each_xattr_handler(handlers, handler) { + if (!handler->list) + continue; size += handler->list(handler, dentry, NULL, 0, NULL, 0); } @@ -730,6 +732,8 
@@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) char *buf = buffer; for_each_xattr_handler(handlers, handler) { + if (!handler->list) + continue; size = handler->list(handler, dentry, buf, buffer_size, NULL, 0); if (size > buffer_size) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index ff81026..37fdd54 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1519,8 +1519,6 @@ static int smack_inode_getsecurity(const struct inode *inode, * @inode: the object * @buffer: where they go * @buffer_size: size of buffer - * - * Returns 0 on success, -EINVAL otherwise */ static int smack_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) -- cgit v1.1 From 1046cb119521b5e1881f380dc99729fc84c96661 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:42 +0100 Subject: ocfs2: Replace list xattr handler operations The list operations of the ocfs2 xattr handlers were never called anywhere. Remove them and directly check in ocfs2_xattr_list_entry which attributes should be skipped over instead. Signed-off-by: Andreas Gruenbacher Cc: Mark Fasheh Cc: Joel Becker Cc: ocfs2-devel@oss.oracle.com Signed-off-by: Al Viro --- fs/ocfs2/xattr.c | 151 +++++++++++++++++++++---------------------------------- 1 file changed, 57 insertions(+), 94 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5823f98..f0e241f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -883,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, return ret; } -static int ocfs2_xattr_list_entry(char *buffer, size_t size, - size_t *result, const char *prefix, +static int ocfs2_xattr_list_entry(struct super_block *sb, + char *buffer, size_t size, + size_t *result, int type, const char *name, int name_len) { char *p = buffer + *result; - int prefix_len = strlen(prefix); - int total_len = prefix_len + name_len + 1; + const char *prefix; + int prefix_len; + int total_len; + switch(type) { + case OCFS2_XATTR_INDEX_USER: + if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + break; + + case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: + case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: + if (!(sb->s_flags & MS_POSIXACL)) + return 0; + break; + + case OCFS2_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return 0; + break; + } + + prefix = ocfs2_xattr_prefix(type); + if (!prefix) + return 0; + prefix_len = strlen(prefix); + total_len = prefix_len + name_len + 1; *result += total_len; /* we are just looking for how big our buffer needs to be */ @@ -913,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode, { size_t result = 0; int i, type, ret; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - - if (prefix) { - name = (const char *)header + - le16_to_cpu(entry->xe_name_offset); + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); - ret = ocfs2_xattr_list_entry(buffer, buffer_size, - &result, prefix, name, - entry->xe_name_len); - if (ret) - return ret; - } + ret = ocfs2_xattr_list_entry(inode->i_sb, + buffer, buffer_size, + &result, type, name, + entry->xe_name_len); + if (ret) + return ret; } return result; @@ -4032,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int ret = 0, type; struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list 
*)para; int i, block_off, new_offset; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, - bucket_xh(bucket), - i, - &block_off, - &new_offset); - if (ret) - break; + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; - name = (const char *)bucket_block(bucket, block_off) + - new_offset; - ret = ocfs2_xattr_list_entry(xl->buffer, - xl->buffer_size, - &xl->result, - prefix, name, - entry->xe_name_len); - if (ret) - break; - } + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(inode->i_sb, + xl->buffer, + xl->buffer_size, + &xl->result, + type, name, + entry->xe_name_len); + if (ret) + break; } return ret; @@ -7225,25 +7245,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, leave: return ret; } + /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ocfs2_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) @@ -7308,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle, const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, .set = ocfs2_xattr_security_set, }; @@ -7316,25 +7320,6 @@ const struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) @@ -7353,7 +7338,6 @@ static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, .set = ocfs2_xattr_trusted_set, }; @@ -7361,26 +7345,6 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = 
OCFS2_SB(dentry->d_sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ocfs2_xattr_user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) @@ -7408,7 +7372,6 @@ static int ocfs2_xattr_user_set(const struct xattr_handler *handler, const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, .set = ocfs2_xattr_user_set, }; -- cgit v1.1 From 764a5c6b1fa4306dd7573c1d80914254909cd036 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 2 Dec 2015 14:44:43 +0100 Subject: xattr handlers: Simplify list operation Change the list operation to only return whether or not an attribute should be listed. Copying the attribute names into the buffer is moved to the callers. Since the result only depends on the dentry and not on the attribute name, we do not pass the attribute name to list operations. Signed-off-by: Andreas Gruenbacher Signed-off-by: Al Viro --- fs/ext2/xattr.c | 15 +++++--- fs/ext2/xattr_security.c | 17 --------- fs/ext2/xattr_trusted.c | 19 ++-------- fs/ext2/xattr_user.c | 19 ++-------- fs/ext4/xattr.c | 17 +++++---- fs/ext4/xattr_security.c | 18 ---------- fs/ext4/xattr_trusted.c | 19 ++-------- fs/ext4/xattr_user.c | 19 ++-------- fs/f2fs/xattr.c | 82 ++++++++++++++------------------------------ fs/jffs2/security.c | 16 --------- fs/jffs2/xattr.c | 26 ++++++++------ fs/jffs2/xattr_trusted.c | 17 ++------- fs/jffs2/xattr_user.c | 16 --------- fs/nfs/nfs4proc.c | 14 ++------ fs/posix_acl.c | 17 ++------- fs/reiserfs/xattr.c | 13 +++---- fs/reiserfs/xattr_security.c | 16 ++------- fs/reiserfs/xattr_trusted.c | 15 ++------ fs/reiserfs/xattr_user.c | 14 ++------ fs/squashfs/xattr.c | 35 +++++-------------- fs/xattr.c | 20 ++++++----- include/linux/xattr.h | 4 +-- 22 files changed, 113 insertions(+), 335 deletions(-) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fa70848..cd95d14 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -292,16 +292,21 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", const struct xattr_handler *handler = ext2_xattr_handler(entry->e_name_index); - if (handler) { - size_t size = handler->list(handler, dentry, buffer, - rest, entry->e_name, - entry->e_name_len); + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + if (buffer) { if (size > rest) { error = -ERANGE; goto cleanup; } - buffer += size; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } rest -= size; } diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 118bf231..ba97f24 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -7,22 +7,6 @@ #include #include "xattr.h" -static size_t -ext2_xattr_security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) -{ - const int prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, 
XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ext2_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, @@ -67,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir, const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ext2_xattr_security_list, .get = ext2_xattr_security_get, .set = ext2_xattr_security_set, }; diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 3f8f2bc..2c94d19 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -8,23 +8,10 @@ #include "ext2.h" #include "xattr.h" -static size_t -ext2_xattr_trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext2_xattr_trusted_list(struct dentry *dentry) { - const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return capable(CAP_SYS_ADMIN); } static int diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index afd45ab..72a2a96 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -10,23 +10,10 @@ #include "ext2.h" #include "xattr.h" -static size_t -ext2_xattr_user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext2_xattr_user_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return test_opt(dentry->d_sb, XATTR_USER); } static int diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6b6b3e7..e9b9afd 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -404,19 +404,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); - if (handler) { - size_t size = handler->list(handler, dentry, buffer, - rest, entry->e_name, - entry->e_name_len); + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + if (buffer) { if (size > rest) return -ERANGE; - buffer += size; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } rest -= size; } } - return buffer_size - rest; + return buffer_size - rest; /* total size */ } static int diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 195abc4..3e81bdc 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -11,23 +11,6 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) -{ - 
const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; - const size_t total_len = prefix_len + name_len + 1; - - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ext4_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, @@ -75,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ext4_xattr_security_list, .get = ext4_xattr_security_get, .set = ext4_xattr_security_set, }; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 121fdf9..2a3c6f9 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -12,23 +12,10 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext4_xattr_trusted_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return capable(CAP_SYS_ADMIN); } static int diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 577fc12..d152f43 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -11,23 +11,10 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext4_xattr_user_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return test_opt(dentry->d_sb, XATTR_USER); } static int diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 21cfe51..036952a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -25,40 +25,6 @@ #include "f2fs.h" #include "xattr.h" -static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t len) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *prefix; - int total_len, prefix_len; - - switch (handler->flags) { - case F2FS_XATTR_INDEX_USER: - if (!test_opt(sbi, XATTR_USER)) - return -EOPNOTSUPP; - break; - case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case F2FS_XATTR_INDEX_SECURITY: - break; - default: - return -EINVAL; - } - - prefix = xattr_prefix(handler); - prefix_len = strlen(prefix); - total_len = prefix_len + len + 1; - if (list && total_len <= list_size) { - memcpy(list, prefix, prefix_len); - memcpy(list + prefix_len, name, len); - list[prefix_len + len] = '\0'; - } - return total_len; -} - static int f2fs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) 
@@ -107,17 +73,16 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, value, size, NULL, flags); } -static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t len) +static bool f2fs_xattr_user_list(struct dentry *dentry) { - const char *xname = F2FS_SYSTEM_ADVISE_NAME; - size_t size; + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + + return test_opt(sbi, XATTR_USER); +} - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; +static bool f2fs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); } static int f2fs_xattr_advise_get(const struct xattr_handler *handler, @@ -175,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir, const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_user_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -183,7 +148,7 @@ const struct xattr_handler f2fs_xattr_user_handler = { const struct xattr_handler f2fs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = F2FS_XATTR_INDEX_TRUSTED, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_trusted_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -191,7 +156,6 @@ const struct xattr_handler f2fs_xattr_trusted_handler = { const struct xattr_handler f2fs_xattr_advise_handler = { .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, - .list = f2fs_xattr_advise_list, .get = f2fs_xattr_advise_get, .set = f2fs_xattr_advise_set, }; @@ -199,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = { const struct xattr_handler f2fs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = F2FS_XATTR_INDEX_SECURITY, - .list = f2fs_xattr_generic_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -447,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = f2fs_xattr_handler(entry->e_name_index); + const char *prefix; + size_t prefix_len; size_t size; - if (!handler) + if (!handler || (handler->list && !handler->list(dentry))) continue; - size = handler->list(handler, dentry, buffer, rest, - entry->e_name, entry->e_name_len); - if (buffer && size > rest) { - error = -ERANGE; - goto cleanup; + prefix = handler->prefix ?: handler->name; + prefix_len = strlen(prefix); + size = prefix_len + entry->e_name_len + 1; + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } - - if (buffer) - buffer += size; rest -= size; } error = buffer_size - rest; diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index ea79932..7a28fac 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -64,24 +64,8 @@ static int jffs2_security_setxattr(const struct xattr_handler *handler, name, buffer, size, flags); } -static size_t jffs2_security_listxattr(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; - - if (list && retlen <= list_size) { - strcpy(list, 
XATTR_SECURITY_PREFIX); - strcpy(list + XATTR_SECURITY_PREFIX_LEN, name); - } - - return retlen; -} - const struct xattr_handler jffs2_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = jffs2_security_listxattr, .set = jffs2_security_setxattr, .get = jffs2_security_getxattr }; diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4c2c036..da3e185 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; const struct xattr_handler *xhandle; - ssize_t len, rc; + const char *prefix; + ssize_t prefix_len, len, rc; int retry = 0; rc = check_xattr_ref_inode(c, ic); @@ -998,18 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) } } xhandle = xprefix_to_handler(xd->xprefix); - if (!xhandle) + if (!xhandle || (xhandle->list && !xhandle->list(dentry))) continue; + prefix = xhandle->prefix ?: xhandle->name; + prefix_len = strlen(prefix); + rc = prefix_len + xd->name_len + 1; + if (buffer) { - rc = xhandle->list(xhandle, dentry, buffer + len, - size - len, xd->xname, - xd->name_len); - } else { - rc = xhandle->list(xhandle, dentry, NULL, 0, - xd->xname, xd->name_len); + if (rc > size - len) { + rc = -ERANGE; + goto out; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, xd->xname, xd->name_len); + buffer += xd->name_len; + *buffer++ = 0; } - if (rc < 0) - goto out; len += rc; } rc = len; diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 8b55fe4..b2555ef 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -32,22 +32,9 @@ static int jffs2_trusted_setxattr(const struct xattr_handler *handler, name, buffer, size, flags); } -static size_t jffs2_trusted_listxattr(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) +static bool jffs2_trusted_listxattr(struct dentry *dentry) { - size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && retlen<=list_size) { - strcpy(list, XATTR_TRUSTED_PREFIX); - strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name); - } - - return retlen; + return capable(CAP_SYS_ADMIN); } const struct xattr_handler jffs2_trusted_xattr_handler = { diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index b04335b..539bd63 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -32,24 +32,8 @@ static int jffs2_user_setxattr(const struct xattr_handler *handler, name, buffer, size, flags); } -static size_t jffs2_user_listxattr(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; - - if (list && retlen <= list_size) { - strcpy(list, XATTR_USER_PREFIX); - strcpy(list + XATTR_USER_PREFIX_LEN, name); - } - - return retlen; -} - const struct xattr_handler jffs2_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, - .list = jffs2_user_listxattr, .set = jffs2_user_setxattr, .get = jffs2_user_getxattr }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dbfade2..c57d133 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6263,19 +6263,9 @@ static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); } -static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler 
*handler, - struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len) +static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - size_t len = sizeof(XATTR_NAME_NFSV4_ACL); - - if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)))) - return 0; - - if (list && len <= list_len) - memcpy(list, XATTR_NAME_NFSV4_ACL, len); - return len; + return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); } #ifdef CONFIG_NFS_V4_SECURITY_LABEL diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 17efd76..711dd51 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -823,21 +823,10 @@ out: return ret; } -static size_t -posix_acl_xattr_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +posix_acl_xattr_list(struct dentry *dentry) { - const char *xname = handler->name; - size_t size; - - if (!IS_POSIXACL(d_backing_inode(dentry))) - return 0; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; + return IS_POSIXACL(d_backing_inode(dentry)); } const struct xattr_handler posix_acl_access_xattr_handler = { diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index efe2ed3..e5ddb4e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -840,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); - if (!handler) /* Unsupported xattr name */ + if (!handler /* Unsupported xattr name */ || + (handler->list && !handler->list(b->dentry))) return 0; + size = namelen + 1; if (b->buf) { - size = handler->list(handler, b->dentry, - b->buf + b->pos, b->size, name, - namelen); if (size > b->size) return -ERANGE; - } else { - size = handler->list(handler, b->dentry, - NULL, 0, name, namelen); + memcpy(b->buf + b->pos, name, namelen); + b->buf[b->pos + namelen] = 0; } - b->pos += size; } return 0; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ac659af..ab0217d 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -34,21 +34,9 @@ security_set(const struct xattr_handler *handler, struct dentry *dentry, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t namelen) +static bool security_list(struct dentry *dentry) { - const size_t len = namelen + 1; - - if (IS_PRIVATE(d_inode(dentry))) - return 0; - - if (list && len <= list_len) { - memcpy(list, name, namelen); - list[namelen] = '\0'; - } - - return len; + return !IS_PRIVATE(d_inode(dentry)); } /* Initializes the security context for a new inode and returns the number diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a338adf..64b67aa 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -33,20 +33,9 @@ trusted_set(const struct xattr_handler *handler, struct dentry *dentry, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool trusted_list(struct dentry *dentry) { - const size_t len = name_len + 1; - - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) - return 0; - - if (list && len <= list_size) { - memcpy(list, name, 
name_len); - list[name_len] = '\0'; - } - return len; + return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry)); } const struct xattr_handler reiserfs_xattr_trusted_handler = { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 39c9667..12e6306 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -30,19 +30,9 @@ user_set(const struct xattr_handler *handler, struct dentry *dentry, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool user_list(struct dentry *dentry) { - const size_t len = name_len + 1; - - if (!reiserfs_xattrs_user(dentry->d_sb)) - return 0; - if (list && len <= list_size) { - memcpy(list, name, name_len); - list[name_len] = '\0'; - } - return len; + return reiserfs_xattrs_user(dentry->d_sb); } const struct xattr_handler reiserfs_xattr_user_handler = { diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 2f0ccba..1e9de96 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer, struct squashfs_xattr_entry entry; struct squashfs_xattr_val val; const struct xattr_handler *handler; - int name_size, prefix_size = 0; + int name_size; err = squashfs_read_metadata(sb, &entry, &start, &offset, sizeof(entry)); @@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer, name_size = le16_to_cpu(entry.size); handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); - if (handler) - prefix_size = handler->list(handler, d, buffer, rest, - NULL, name_size); - if (prefix_size) { + if (handler && (!handler->list || handler->list(d))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_size = strlen(prefix); + if (buffer) { if (prefix_size + name_size + 1 > rest) { err = -ERANGE; goto failed; } + memcpy(buffer, prefix, prefix_size); buffer += prefix_size; } err = squashfs_read_metadata(sb, buffer, &start, @@ -212,18 +213,6 @@ failed: } -static size_t squashfs_xattr_handler_list(const struct xattr_handler *handler, - struct dentry *d, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - int len = strlen(handler->prefix); - - if (list && len <= list_size) - memcpy(list, handler->prefix, len); - return len; -} - static int squashfs_xattr_handler_get(const struct xattr_handler *handler, struct dentry *d, const char *name, void *buffer, size_t size) @@ -238,22 +227,15 @@ static int squashfs_xattr_handler_get(const struct xattr_handler *handler, static const struct xattr_handler squashfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = SQUASHFS_XATTR_USER, - .list = squashfs_xattr_handler_list, .get = squashfs_xattr_handler_get }; /* * Trusted namespace support */ -static size_t squashfs_trusted_xattr_handler_list(const struct xattr_handler *handler, - struct dentry *d, char *list, - size_t list_size, const char *name, - size_t name_len) +static bool squashfs_trusted_xattr_handler_list(struct dentry *d) { - if (!capable(CAP_SYS_ADMIN)) - return 0; - return squashfs_xattr_handler_list(handler, d, list, list_size, name, - name_len); + return capable(CAP_SYS_ADMIN); } static const struct xattr_handler squashfs_xattr_trusted_handler = { @@ -269,7 +251,6 @@ static const struct xattr_handler squashfs_xattr_trusted_handler = { static const struct xattr_handler squashfs_xattr_security_handler = { .prefix = 
XATTR_SECURITY_PREFIX, .flags = SQUASHFS_XATTR_SECURITY, - .list = squashfs_xattr_handler_list, .get = squashfs_xattr_handler_get }; diff --git a/fs/xattr.c b/fs/xattr.c index 2c77764..d7f5037 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -723,23 +723,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!buffer) { for_each_xattr_handler(handlers, handler) { - if (!handler->list) + if (!handler->name || + (handler->list && !handler->list(dentry))) continue; - size += handler->list(handler, dentry, NULL, 0, - NULL, 0); + size += strlen(handler->name) + 1; } } else { char *buf = buffer; + size_t len; for_each_xattr_handler(handlers, handler) { - if (!handler->list) + if (!handler->name || + (handler->list && !handler->list(dentry))) continue; - size = handler->list(handler, dentry, buf, buffer_size, - NULL, 0); - if (size > buffer_size) + len = strlen(handler->name); + if (len + 1 > buffer_size) return -ERANGE; - buf += size; - buffer_size -= size; + memcpy(buf, handler->name, len + 1); + buf += len + 1; + buffer_size -= len + 1; } size = buf - buffer; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index d23ce8e..4457541 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -28,9 +28,7 @@ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ - size_t (*list)(const struct xattr_handler *, struct dentry *dentry, - char *list, size_t list_size, const char *name, - size_t name_len); + bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct dentry *dentry, -- cgit v1.1 From f74acf0e4326bfaa2c0be1e82f23801fe347cd9c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 12 Dec 2015 11:27:57 +0100 Subject: x86/entry/64_compat: Make labels local ... so that they don't appear as symbols in the final ELF. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449916077-6506-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bbcb285..8d802a1 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -96,15 +96,15 @@ ENTRY(entry_SYSENTER_compat) * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * - * NB.: sysenter_fix_flags is a label with the code under it moved + * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. 
*/ testl $X86_EFLAGS_NT, EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: /* * User mode is traced as though IRQs are on, and SYSENTER @@ -119,10 +119,10 @@ sysenter_flags_fixed: "jmp .Lsyscall_32_done", X86_FEATURE_XENPV jmp sysret32_from_system_call -sysenter_fix_flags: +.Lsysenter_fix_flags: pushq $X86_EFLAGS_FIXED popfq - jmp sysenter_flags_fixed + jmp .Lsysenter_flags_fixed ENDPROC(entry_SYSENTER_compat) /* -- cgit v1.1 From 9daddf66a37708ec7182a7058f159166d12c9812 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Dec 2015 16:43:57 -0300 Subject: perf tools: Use same signal handling strategy as 'record' I.e. don't exit with the signal number; instead, reset the signal handler to the default one and then raise the signal again. Noticed while trying to dump the stack on segfaults in the forked process that 'perf test' uses to run each test, since the test harness inspects the signal info of each test. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-5x5r176wnoqxi5p6id05wv9w@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 07da970..aff0cfd 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -352,7 +352,8 @@ void sighandler_dump_stack(int sig) { psignal(sig, "perf"); dump_stack(); - exit(sig); + signal(sig, SIG_DFL); + raise(sig); } int parse_nsec_time(const char *str, u64 *ptime) -- cgit v1.1 From b6847d2c2a50e96680e233ce4b2784981b6f309e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Dec 2015 19:06:53 -0300 Subject: perf test: Dump the stack when a test segfaults in verbose mode E.g.: # perf test 26 26: Test mmap thread lookup : FAILED! # perf test -v 26 26: Test mmap thread lookup : --- start --- test child forked, pid 9269 tid = 9269, map = 0x7ff99ff0c000 tid = 9270, map = 0x7ff99ff0b000 tid = 9271, map = 0x7ff99ff0a000 tid = 9272, map = 0x7ff99ff09000 perf: Segmentation fault Obtained 13 stack frames. perf(sighandler_dump_stack+0x41) [0x4e3541] /lib64/libc.so.6(+0x34960) [0x7ff99d5f6960] perf(thread__put+0x5b) [0x4c6f6b] perf(machine__process_event+0x14e) [0x4bd37e] perf(perf_event__synthesize_threads+0x3aa) [0x48678a] perf(test__mmap_thread_lookup+0x20a) [0x474e0a] perf() [0x460d56] perf(cmd_test+0x589) [0x461319] perf() [0x47c641] perf(main+0x617) [0x422317] /lib64/libc.so.6(__libc_start_main+0xf0) [0x7ff99d5e1fe0] perf() [0x422429] [(nil)] test child interrupted ---- end ---- Test mmap thread lookup: FAILED! #
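To make the effect concrete, an illustrative sketch (hypothetical harness code, not builtin-test.c) of why re-raising matters: the parent that forked the test child can then tell a crash apart from an ordinary failing exit code.

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>

static void report_child(pid_t child)
{
	int status;

	waitpid(child, &status, 0);
	if (WIFSIGNALED(status))
		psignal(WTERMSIG(status), "test child");	/* e.g. SIGSEGV */
	else if (WIFEXITED(status))
		printf("test child exited with %d\n", WEXITSTATUS(status));
}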
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-sypazzsl4ptctrmlyi2zcmaj@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 2b1ade1..fa98406 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -236,6 +236,9 @@ static int run_test(struct test *test, int subtest) dup2(STDOUT_FILENO, STDERR_FILENO); close(nullfd); } + } else { + signal(SIGSEGV, sighandler_dump_stack); + signal(SIGFPE, sighandler_dump_stack); } err = test->func(subtest); -- cgit v1.1 From abd828688407eb86044f1bc9e5133c55d7597596 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 11 Dec 2015 19:11:23 -0300 Subject: perf thread: Fix reference count initial state We should always return from thread__new(), the constructor, with the object having a reference count of one, so that: struct thread *thread = thread__new(); thread__put(thread); will call thread__delete(). Any other holder of that 'thread' pointer must use thread__get(thread) to take its own reference. We were returning with thread->refcnt set to zero. Fix that, and also fix the places that called thread__delete() directly: they were not a problem while just one reference was in use, but now that the constructor returns with the count set to 1, they must use thread__put() instead.
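Sketched with the interfaces from the patch, the resulting ownership rule looks like this (illustrative snippet; pid and tid are assumed to be in scope):

	struct thread *thread = thread__new(pid, tid);	/* constructor: refcnt == 1 */
	struct thread *extra = thread__get(thread);	/* second holder: refcnt == 2 */

	thread__put(extra);				/* refcnt back to 1 */
	thread__put(thread);				/* refcnt hits 0: thread__delete() runs */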
+ */ static struct thread *____machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid, bool create) @@ -376,7 +381,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, if (th != NULL) { if (th->tid == tid) { machine__update_thread_pid(machine, th, pid); - return th; + return thread__get(th); } machine->last_match = NULL; @@ -389,7 +394,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, if (th->tid == tid) { machine->last_match = th; machine__update_thread_pid(machine, th, pid); - return th; + return thread__get(th); } if (tid < th->tid) @@ -417,7 +422,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, if (thread__init_map_groups(th, machine)) { rb_erase_init(&th->rb_node, &machine->threads); RB_CLEAR_NODE(&th->rb_node); - thread__delete(th); + thread__put(th); return NULL; } /* @@ -441,7 +446,7 @@ struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, struct thread *th; pthread_rwlock_wrlock(&machine->threads_lock); - th = thread__get(__machine__findnew_thread(machine, pid, tid)); + th = __machine__findnew_thread(machine, pid, tid); pthread_rwlock_unlock(&machine->threads_lock); return th; } @@ -451,7 +456,7 @@ struct thread *machine__find_thread(struct machine *machine, pid_t pid, { struct thread *th; pthread_rwlock_rdlock(&machine->threads_lock); - th = thread__get(____machine__findnew_thread(machine, pid, tid, false)); + th = ____machine__findnew_thread(machine, pid, tid, false); pthread_rwlock_unlock(&machine->threads_lock); return th; } diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 0a9ae80..dfd00c6 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -19,8 +19,10 @@ int thread__init_map_groups(struct thread *thread, struct machine *machine) thread->mg = map_groups__new(machine); } else { leader = __machine__findnew_thread(machine, pid, pid); - if (leader) + if (leader) { thread->mg = map_groups__get(leader->mg); + thread__put(leader); + } } return thread->mg ? 0 : -1; @@ -53,7 +55,7 @@ struct thread *thread__new(pid_t pid, pid_t tid) goto err_thread; list_add(&comm->list, &thread->comm_list); - atomic_set(&thread->refcnt, 0); + atomic_set(&thread->refcnt, 1); RB_CLEAR_NODE(&thread->rb_node); } @@ -95,6 +97,10 @@ struct thread *thread__get(struct thread *thread) void thread__put(struct thread *thread) { if (thread && atomic_dec_and_test(&thread->refcnt)) { + /* + * Remove it from the dead_threads list, as last reference + * is gone. + */ list_del_init(&thread->node); thread__delete(thread); } -- cgit v1.1 From bd0f889536f80630c1c4a414f2de90744d2c87d0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 11 Dec 2015 16:12:24 -0800 Subject: perf evsel: Disable branch flags/cycles for --callgraph lbr [The kernel patch needed for this is in tip now (b16a5b52eb9 perf/x86: Add option to disable ...) So this user tools patch to make use of it should be merged now] Automatically disable collecting branch flags and cycles with --call-graph lbr. This allows avoiding a bunch of extra MSR reads in the PMI on Skylake. When the kernel doesn't support the new flags they are automatically cleared in the fallback code. v2: Switch to use branch_sample_type instead of sample_type. Adjust description. Fix the fallback logic. 
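A compile-ready sketch of the probe-and-retry idiom this commit message describes and the evsel.c hunk below implements (illustrative only, not part of the patch; names are simplified, there is no main(), and it assumes a uapi perf_event.h new enough to define PERF_SAMPLE_BRANCH_NO_FLAGS/NO_CYCLES):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdbool.h>

	static bool lbr_flags_missing;	/* sticky, like perf_missing_features */

	static int open_counter(struct perf_event_attr *attr)
	{
		long fd;

	retry:
		if (lbr_flags_missing)
			attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
						      PERF_SAMPLE_BRANCH_NO_CYCLES);

		fd = syscall(SYS_perf_event_open, attr, 0, -1, -1, 0UL);
		if (fd >= 0)
			return (int)fd;

		/* Older kernel: drop only the optional bits, remember that,
		 * and try the open once more. */
		if (!lbr_flags_missing &&
		    (attr->branch_sample_type & (PERF_SAMPLE_BRANCH_NO_FLAGS |
						 PERF_SAMPLE_BRANCH_NO_CYCLES))) {
			lbr_flags_missing = true;
			goto retry;
		}
		return -1;
	}

Caching the failure in a static flag means later opens skip the doomed first attempt, which is exactly what the perf_missing_features bookkeeping below buys.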
Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1449879144-29074-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 47f0330..544e440 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -36,6 +36,7 @@ static struct { bool cloexec; bool clockid; bool clockid_wrong; + bool lbr_flags; } perf_missing_features; static clockid_t clockid; @@ -574,7 +575,9 @@ perf_evsel__config_callgraph(struct perf_evsel *evsel, } else { perf_evsel__set_sample_bit(evsel, BRANCH_STACK); attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER | - PERF_SAMPLE_BRANCH_CALL_STACK; + PERF_SAMPLE_BRANCH_CALL_STACK | + PERF_SAMPLE_BRANCH_NO_CYCLES | + PERF_SAMPLE_BRANCH_NO_FLAGS; } } else pr_warning("Cannot use LBR callstack with branch stack. " @@ -1337,6 +1340,9 @@ fallback_missing_features: evsel->attr.mmap2 = 0; if (perf_missing_features.exclude_guest) evsel->attr.exclude_guest = evsel->attr.exclude_host = 0; + if (perf_missing_features.lbr_flags) + evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS | + PERF_SAMPLE_BRANCH_NO_CYCLES); retry_sample_id: if (perf_missing_features.sample_id_all) evsel->attr.sample_id_all = 0; @@ -1455,6 +1461,12 @@ try_fallback: } else if (!perf_missing_features.sample_id_all) { perf_missing_features.sample_id_all = true; goto retry_sample_id; + } else if (!perf_missing_features.lbr_flags && + (evsel->attr.branch_sample_type & + (PERF_SAMPLE_BRANCH_NO_CYCLES | + PERF_SAMPLE_BRANCH_NO_FLAGS))) { + perf_missing_features.lbr_flags = true; + goto fallback_missing_features; } out_close: -- cgit v1.1 From 71d6de64feddd4b455555326fba2111b3006d9e0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 14 Dec 2015 12:11:13 +0900 Subject: perf test: Fix hist testcases when kptr_restrict is on Currently if kptr_restrict is enabled, all hist tests failed with segfaults. This is because machine__create_kernel_maps() in setup_fake_machine() failed in that situation, and it called machine__delete() on the error path. But outer callers again called machines__exit() causing double free for the host machine. 
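The bug class here is an ownership mismatch on an error path. A contrived, self-contained illustration (not the perf code) of why a helper must not free an object its caller still owns:

	#include <stdio.h>
	#include <stdlib.h>

	struct machine { char *name; };

	static int machine__setup(struct machine *m)
	{
		/* Pretend kernel map creation failed.  The buggy version
		 * also did the equivalent of free(m) here, even though the
		 * container (main() below) still owns and frees 'm'. */
		(void)m;
		return -1;
	}

	int main(void)
	{
		struct machine *m = calloc(1, sizeof(*m));

		if (machine__setup(m) < 0)
			fprintf(stderr, "setup failed, container cleans up\n");
		free(m);	/* exactly one owner releases the object */
		return 0;
	}

The fix below follows the same rule: setup_fake_machine() only reports failure and leaves teardown of the host machine to machines__exit().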
Signed-off-by: Namhyung Kim Cc: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1450062673-22312-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/hists_common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index 46f453b..bcfd081 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -88,8 +88,8 @@ struct machine *setup_fake_machine(struct machines *machines) } if (machine__create_kernel_maps(machine)) { - pr_debug("Not enough memory for machine setup\n"); - goto out; + pr_debug("Cannot create kernel maps\n"); + return NULL; } for (i = 0; i < ARRAY_SIZE(fake_threads); i++) { @@ -155,7 +155,6 @@ struct machine *setup_fake_machine(struct machines *machines) out: pr_debug("Not enough memory for machine setup\n"); machine__delete_threads(machine); - machine__delete(machine); return NULL; } -- cgit v1.1 From 26bc9b2df1f38536cdfd58df94bf8b5601eb894a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:01 -0600 Subject: perf build: Remove unnecessary line in Makefile.feature This line always silently fails because it doesn't add the 'test-' prefix to the .bin file. And it seems to be unnecessary anyway: the line immediately after it does all the individual feature checks. Signed-off-by: Josh Poimboeuf Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/554a05c18af564ba015c9e68f25730126e0f4acb.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 37ff4c9..b8c31ec 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -101,7 +101,6 @@ ifeq ($(feature-all), 1) # $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat))) else - $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS)" LDFLAGS=$(LDFLAGS) -i -j -C $(feature_dir) $(addsuffix .bin,$(FEATURE_TESTS)) >/dev/null 2>&1) $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat))) endif -- cgit v1.1 From 8bda6a63a049caee4f8ddf2dd99055794df96e4f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:03 -0600 Subject: perf test: Add Build file to dependencies for llvm-src-*.c Because the Build file writes source code to the generated llvm-src-*.c files, it should be listed as one of the dependencies, so that any future changes to the code being echoed won't require a 'make clean'. 
Signed-off-by: Josh Poimboeuf Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/9b9886c295750dc83cbbb29a665d280f9c5e8b3e.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/Build | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 0ff8a97..f23fb7e 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -35,21 +35,21 @@ perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o perf-y += bpf.o perf-y += topology.o -$(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c +$(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) $(Q)echo '#include ' > $@ $(Q)echo 'const char test_llvm__bpf_base_prog[] =' >> $@ $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@ $(Q)echo ';' >> $@ -$(OUTPUT)tests/llvm-src-kbuild.c: tests/bpf-script-test-kbuild.c +$(OUTPUT)tests/llvm-src-kbuild.c: tests/bpf-script-test-kbuild.c tests/Build $(call rule_mkdir) $(Q)echo '#include ' > $@ $(Q)echo 'const char test_llvm__bpf_test_kbuild_prog[] =' >> $@ $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@ $(Q)echo ';' >> $@ -$(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c +$(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c tests/Build $(call rule_mkdir) $(Q)echo '#include ' > $@ $(Q)echo 'const char test_llvm__bpf_test_prologue_prog[] =' >> $@ -- cgit v1.1 From 004bd89da8c8e7df87e951bf88e34af67348e4e9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:04 -0600 Subject: perf test: Remove tarpkg at end of test Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5e7e97a23e3ce11b59d1009b39ebb6d2813a0560.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/make | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 8ea3dff..c1fbb8e 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -259,7 +259,8 @@ $(run_O): tarpkg: @cmd="$(PERF)/tests/perf-targz-src-pkg $(PERF)"; \ echo "- $@: $$cmd" && echo $$cmd > $@ && \ - ( eval $$cmd ) >> $@ 2>&1 + ( eval $$cmd ) >> $@ 2>&1 && \ + rm -f $@ make_kernelsrc: @echo "- make -C tools/perf" -- cgit v1.1 From 8f46dfd73e5378909834bd32a4e7710cd5522506 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:05 -0600 Subject: perf build: Fix 'make clean' Add some missing files to the 'make clean' target. 
Reported-and-Acked-by: Jiri Olsa Signed-off-by: Josh Poimboeuf Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/8b1f5a5bd66a652be071d423e64aaa994254be31.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 929a32b..906c723 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -582,15 +582,16 @@ $(INSTALL_DOC_TARGETS): # config-clean: $(call QUIET_CLEAN, config) - $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null + $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean config-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) - $(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ - $(OUTPUT)util/intel-pt-decoder/inat-tables.c + $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \ + $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean $(python-clean) -- cgit v1.1 From 212e984a07d19c2e6b83da4ebac4e965dd92efd3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:06 -0600 Subject: perf build: Rename LIB_PATH -> API_PATH 'LIB_PATH' is a misnomer because there are multiple library paths. Signed-off-by: Josh Poimboeuf Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/c10df0b749a27f05cc531fe06b8dd71a329341fa.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 906c723..388ec64f 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -185,13 +185,13 @@ ifneq ($(OUTPUT),) TE_PATH=$(OUTPUT) BPF_PATH=$(OUTPUT) ifneq ($(subdir),) - LIB_PATH=$(OUTPUT)/../lib/api/ + API_PATH=$(OUTPUT)/../lib/api/ else - LIB_PATH=$(OUTPUT) + API_PATH=$(OUTPUT) endif else TE_PATH=$(TRACE_EVENT_DIR) - LIB_PATH=$(LIB_DIR) + API_PATH=$(LIB_DIR) BPF_PATH=$(BPF_DIR) endif @@ -201,7 +201,7 @@ export LIBTRACEEVENT LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)libtraceevent-dynamic-list LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS = -Xlinker --dynamic-list=$(LIBTRACEEVENT_DYNAMIC_LIST) -LIBAPI = $(LIB_PATH)libapi.a +LIBAPI = $(API_PATH)libapi.a export LIBAPI LIBBPF = $(BPF_PATH)libbpf.a -- cgit v1.1 From 32a56bd438ab3023d08874e2770aa0675364b8ab Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:07 -0600 Subject: perf tools: Create pager.h Move the 'pager' function prototypes into a new pager.h so that the pager code can be moved out to a library. 
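A hypothetical standalone consumer of the new header (not part of the patch; it assumes pager.o and its current perf dependencies are linked in) shows the point of the split, namely that paging no longer drags in cache.h:

	#include <stdio.h>
	#include "pager.h"	/* just the two prototypes, no cache.h baggage */

	int main(void)
	{
		setup_pager();	/* pipe our stdout through the user's pager */
		for (int i = 0; i < 200; i++)
			printf("line %d\n", i);
		if (pager_in_use())
			fprintf(stderr, "output went through a pager\n");
		return 0;
	}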
Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/ba7c316474dd6bfc047e5c6dc4dcab39a982caf5.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cache.h | 5 +---- tools/perf/util/pager.h | 7 +++++++ 2 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 tools/perf/util/pager.h diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 4c2b764..9ca4a58 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -4,6 +4,7 @@ #include #include "util.h" #include "strbuf.h" +#include "pager.h" #include "../perf.h" #include "../ui/ui.h" @@ -28,10 +29,6 @@ extern int perf_config_bool(const char *, const char *); extern int config_error_nonbool(const char *); extern const char *perf_config_dirname(const char *, const char *); -/* pager.c */ -extern void setup_pager(void); -extern int pager_in_use(void); - char *alias_lookup(const char *alias); int split_cmdline(char *cmdline, const char ***argv); diff --git a/tools/perf/util/pager.h b/tools/perf/util/pager.h new file mode 100644 index 0000000..2794a83 --- /dev/null +++ b/tools/perf/util/pager.h @@ -0,0 +1,7 @@ +#ifndef __PERF_PAGER_H +#define __PERF_PAGER_H + +extern void setup_pager(void); +extern int pager_in_use(void); + +#endif /* __PERF_PAGER_H */ -- cgit v1.1 From a871a775172ac586b76199fd158e2843971bd052 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:08 -0600 Subject: perf tools: Remove check for unused PERF_PAGER_IN_USE PERF_PAGER_IN_USE doesn't seem to be used anywhere, so let's remove it. This will also make it easier to move pager.c into a separate library. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/ed9e8370db9811746dc590544cf48c36dcfb1731.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pager.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c index 53ef006..7dcbef6 100644 --- a/tools/perf/util/pager.c +++ b/tools/perf/util/pager.c @@ -85,11 +85,5 @@ void setup_pager(void) int pager_in_use(void) { - const char *env; - - if (spawned_pager) - return 1; - - env = getenv("PERF_PAGER_IN_USE"); - return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0; + return spawned_pager; } -- cgit v1.1 From 5feaac248a46dd5f9876c4ae45c4bbbde5472e90 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:09 -0600 Subject: perf tools: Move help_unknown_cmd() to its own file help_unknown_cmd() is quite perf-specific because it relies on some perf_config*() functions. Move it and its supporting functions out into a separate file so that help.c can be moved to a library. 
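The heart of the code being moved is a "did you mean" loop: score every known command by edit distance against the unknown one and keep the best match. A stripped-down, self-contained sketch of that idea (plain unweighted Levenshtein here; perf's levenshtein() in the diff below takes extra per-operation cost weights):

	#include <stdio.h>
	#include <string.h>

	static int edit_distance(const char *a, const char *b)
	{
		int la = strlen(a), lb = strlen(b);
		int row[65];

		if (lb > 64)
			return la + lb;	/* sketch: punt on long names */

		for (int j = 0; j <= lb; j++)
			row[j] = j;
		for (int i = 1; i <= la; i++) {
			int prev = row[0];	/* d[i-1][j-1] */

			row[0] = i;
			for (int j = 1; j <= lb; j++) {
				int cur = row[j];	/* d[i-1][j] */
				int sub = prev + (a[i - 1] != b[j - 1]);
				int del = cur + 1;
				int ins = row[j - 1] + 1;

				row[j] = sub < del ? sub : del;
				if (ins < row[j])
					row[j] = ins;
				prev = cur;
			}
		}
		return row[lb];
	}

	int main(void)
	{
		const char *cmds[] = { "record", "report", "stat", "top" };
		const char *typo = "recrod", *best = NULL;
		int best_d = 1 << 30;

		for (unsigned int i = 0; i < sizeof(cmds) / sizeof(cmds[0]); i++) {
			int d = edit_distance(typo, cmds[i]);

			if (d < best_d) {
				best_d = d;
				best = cmds[i];
			}
		}
		printf("perf: '%s' is not a perf-command. Did you mean '%s'?\n",
		       typo, best);
		return 0;
	}

Running it suggests 'record' for 'recrod' (distance 2), mirroring what help_unknown_cmd() does with the sorted similarity list.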
Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/562d918bcaaf340c1ae3e47586b3f0ae33b9918b.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/Build | 1 + tools/perf/util/help-unknown-cmd.c | 103 +++++++++++++++++++++++++++++++++++ tools/perf/util/help-unknown-cmd.h | 0 tools/perf/util/help.c | 107 ++----------------------------------- tools/perf/util/help.h | 3 ++ 5 files changed, 110 insertions(+), 104 deletions(-) create mode 100644 tools/perf/util/help-unknown-cmd.c create mode 100644 tools/perf/util/help-unknown-cmd.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 65fef59..99b3dae 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -87,6 +87,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-bts.o libperf-y += parse-branch-options.o libperf-y += parse-regs-options.o libperf-y += term.o +libperf-y += help-unknown-cmd.o libperf-$(CONFIG_LIBBPF) += bpf-loader.o libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c new file mode 100644 index 0000000..a0820f1 --- /dev/null +++ b/tools/perf/util/help-unknown-cmd.c @@ -0,0 +1,103 @@ +#include "cache.h" +#include "help.h" +#include "../builtin.h" +#include "levenshtein.h" + +static int autocorrect; +static struct cmdnames aliases; + +static int perf_unknown_cmd_config(const char *var, const char *value, void *cb) +{ + if (!strcmp(var, "help.autocorrect")) + autocorrect = perf_config_int(var,value); + /* Also use aliases for command lookup */ + if (!prefixcmp(var, "alias.")) + add_cmdname(&aliases, var + 6, strlen(var + 6)); + + return perf_default_config(var, value, cb); +} + +static int levenshtein_compare(const void *p1, const void *p2) +{ + const struct cmdname *const *c1 = p1, *const *c2 = p2; + const char *s1 = (*c1)->name, *s2 = (*c2)->name; + int l1 = (*c1)->len; + int l2 = (*c2)->len; + return l1 != l2 ? 
l1 - l2 : strcmp(s1, s2); +} + +static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) +{ + unsigned int i; + + ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc); + + for (i = 0; i < old->cnt; i++) + cmds->names[cmds->cnt++] = old->names[i]; + zfree(&old->names); + old->cnt = 0; +} + +const char *help_unknown_cmd(const char *cmd) +{ + unsigned int i, n = 0, best_similarity = 0; + struct cmdnames main_cmds, other_cmds; + + memset(&main_cmds, 0, sizeof(main_cmds)); + memset(&other_cmds, 0, sizeof(main_cmds)); + memset(&aliases, 0, sizeof(aliases)); + + perf_config(perf_unknown_cmd_config, NULL); + + load_command_list("perf-", &main_cmds, &other_cmds); + + add_cmd_list(&main_cmds, &aliases); + add_cmd_list(&main_cmds, &other_cmds); + qsort(main_cmds.names, main_cmds.cnt, + sizeof(main_cmds.names), cmdname_compare); + uniq(&main_cmds); + + if (main_cmds.cnt) { + /* This reuses cmdname->len for similarity index */ + for (i = 0; i < main_cmds.cnt; ++i) + main_cmds.names[i]->len = + levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4); + + qsort(main_cmds.names, main_cmds.cnt, + sizeof(*main_cmds.names), levenshtein_compare); + + best_similarity = main_cmds.names[0]->len; + n = 1; + while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len) + ++n; + } + + if (autocorrect && n == 1) { + const char *assumed = main_cmds.names[0]->name; + + main_cmds.names[0] = NULL; + clean_cmdnames(&main_cmds); + fprintf(stderr, "WARNING: You called a perf program named '%s', " + "which does not exist.\n" + "Continuing under the assumption that you meant '%s'\n", + cmd, assumed); + if (autocorrect > 0) { + fprintf(stderr, "in %0.1f seconds automatically...\n", + (float)autocorrect/10.0); + poll(NULL, 0, autocorrect * 100); + } + return assumed; + } + + fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd); + + if (main_cmds.cnt && best_similarity < 6) { + fprintf(stderr, "\nDid you mean %s?\n", + n < 2 ? 
"this": "one of these"); + + for (i = 0; i < n; i++) + fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); + } + + exit(1); +} diff --git a/tools/perf/util/help-unknown-cmd.h b/tools/perf/util/help-unknown-cmd.h new file mode 100644 index 0000000..e69de29 diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c index 929c93f..8d74f7d 100644 --- a/tools/perf/util/help.c +++ b/tools/perf/util/help.c @@ -1,9 +1,7 @@ #include "cache.h" #include "../builtin.h" #include "exec_cmd.h" -#include "levenshtein.h" #include "help.h" -#include void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) { @@ -17,7 +15,7 @@ void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) cmds->names[cmds->cnt++] = ent; } -static void clean_cmdnames(struct cmdnames *cmds) +void clean_cmdnames(struct cmdnames *cmds) { unsigned int i; @@ -28,14 +26,14 @@ static void clean_cmdnames(struct cmdnames *cmds) cmds->alloc = 0; } -static int cmdname_compare(const void *a_, const void *b_) +int cmdname_compare(const void *a_, const void *b_) { struct cmdname *a = *(struct cmdname **)a_; struct cmdname *b = *(struct cmdname **)b_; return strcmp(a->name, b->name); } -static void uniq(struct cmdnames *cmds) +void uniq(struct cmdnames *cmds) { unsigned int i, j; @@ -233,102 +231,3 @@ int is_in_cmdlist(struct cmdnames *c, const char *s) return 1; return 0; } - -static int autocorrect; -static struct cmdnames aliases; - -static int perf_unknown_cmd_config(const char *var, const char *value, void *cb) -{ - if (!strcmp(var, "help.autocorrect")) - autocorrect = perf_config_int(var,value); - /* Also use aliases for command lookup */ - if (!prefixcmp(var, "alias.")) - add_cmdname(&aliases, var + 6, strlen(var + 6)); - - return perf_default_config(var, value, cb); -} - -static int levenshtein_compare(const void *p1, const void *p2) -{ - const struct cmdname *const *c1 = p1, *const *c2 = p2; - const char *s1 = (*c1)->name, *s2 = (*c2)->name; - int l1 = (*c1)->len; - int l2 = (*c2)->len; - return l1 != l2 ? 
l1 - l2 : strcmp(s1, s2); -} - -static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) -{ - unsigned int i; - - ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc); - - for (i = 0; i < old->cnt; i++) - cmds->names[cmds->cnt++] = old->names[i]; - zfree(&old->names); - old->cnt = 0; -} - -const char *help_unknown_cmd(const char *cmd) -{ - unsigned int i, n = 0, best_similarity = 0; - struct cmdnames main_cmds, other_cmds; - - memset(&main_cmds, 0, sizeof(main_cmds)); - memset(&other_cmds, 0, sizeof(main_cmds)); - memset(&aliases, 0, sizeof(aliases)); - - perf_config(perf_unknown_cmd_config, NULL); - - load_command_list("perf-", &main_cmds, &other_cmds); - - add_cmd_list(&main_cmds, &aliases); - add_cmd_list(&main_cmds, &other_cmds); - qsort(main_cmds.names, main_cmds.cnt, - sizeof(main_cmds.names), cmdname_compare); - uniq(&main_cmds); - - if (main_cmds.cnt) { - /* This reuses cmdname->len for similarity index */ - for (i = 0; i < main_cmds.cnt; ++i) - main_cmds.names[i]->len = - levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4); - - qsort(main_cmds.names, main_cmds.cnt, - sizeof(*main_cmds.names), levenshtein_compare); - - best_similarity = main_cmds.names[0]->len; - n = 1; - while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len) - ++n; - } - - if (autocorrect && n == 1) { - const char *assumed = main_cmds.names[0]->name; - - main_cmds.names[0] = NULL; - clean_cmdnames(&main_cmds); - fprintf(stderr, "WARNING: You called a perf program named '%s', " - "which does not exist.\n" - "Continuing under the assumption that you meant '%s'\n", - cmd, assumed); - if (autocorrect > 0) { - fprintf(stderr, "in %0.1f seconds automatically...\n", - (float)autocorrect/10.0); - poll(NULL, 0, autocorrect * 100); - } - return assumed; - } - - fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd); - - if (main_cmds.cnt && best_similarity < 6) { - fprintf(stderr, "\nDid you mean %s?\n", - n < 2 ? "this": "one of these"); - - for (i = 0; i < n; i++) - fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); - } - - exit(1); -} diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h index 7f5c6de..14851b0 100644 --- a/tools/perf/util/help.h +++ b/tools/perf/util/help.h @@ -20,6 +20,9 @@ void load_command_list(const char *prefix, struct cmdnames *main_cmds, struct cmdnames *other_cmds); void add_cmdname(struct cmdnames *cmds, const char *name, size_t len); +void clean_cmdnames(struct cmdnames *cmds); +int cmdname_compare(const void *a, const void *b); +void uniq(struct cmdnames *cmds); /* Here we require that excludes is a sorted list. 
*/ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes); int is_in_cmdlist(struct cmdnames *c, const char *s); -- cgit v1.1 From 408cf34c176e1832bc2f9f68033a55a765484f93 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 13 Dec 2015 22:18:12 -0600 Subject: perf tools: Convert parse-options.c internal functions to static Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/c027b5f47ec1055077f5650edb1c7ad37c191e6c.1449965119.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-options.c | 18 +++++++++--------- tools/perf/util/parse-options.h | 9 --------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index de3290b..14b2bee 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -360,8 +360,8 @@ static void check_typos(const char *arg, const struct option *options) } } -void parse_options_start(struct parse_opt_ctx_t *ctx, - int argc, const char **argv, int flags) +static void parse_options_start(struct parse_opt_ctx_t *ctx, + int argc, const char **argv, int flags) { memset(ctx, 0, sizeof(*ctx)); ctx->argc = argc - 1; @@ -378,9 +378,9 @@ static int usage_with_options_internal(const char * const *, const struct option *, int, struct parse_opt_ctx_t *); -int parse_options_step(struct parse_opt_ctx_t *ctx, - const struct option *options, - const char * const usagestr[]) +static int parse_options_step(struct parse_opt_ctx_t *ctx, + const struct option *options, + const char * const usagestr[]) { int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); int excl_short_opt = 1; @@ -489,7 +489,7 @@ exclusive: return PARSE_OPT_HELP; } -int parse_options_end(struct parse_opt_ctx_t *ctx) +static int parse_options_end(struct parse_opt_ctx_t *ctx) { memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out)); ctx->out[ctx->cpidx + ctx->argc] = NULL; @@ -717,9 +717,9 @@ static bool option__in_argv(const struct option *opt, const struct parse_opt_ctx return false; } -int usage_with_options_internal(const char * const *usagestr, - const struct option *opts, int full, - struct parse_opt_ctx_t *ctx) +static int usage_with_options_internal(const char * const *usagestr, + const struct option *opts, int full, + struct parse_opt_ctx_t *ctx) { struct option *ordered; diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h index a8e407b..dd1236d 100644 --- a/tools/perf/util/parse-options.h +++ b/tools/perf/util/parse-options.h @@ -195,15 +195,6 @@ extern int parse_options_usage(const char * const *usagestr, const char *optstr, bool short_opt); -extern void parse_options_start(struct parse_opt_ctx_t *ctx, - int argc, const char **argv, int flags); - -extern int parse_options_step(struct parse_opt_ctx_t *ctx, - const struct option *options, - const char * const usagestr[]); - -extern int parse_options_end(struct parse_opt_ctx_t *ctx); - /*----- some often used options -----*/ extern int parse_opt_abbrev_cb(const struct option *, const char *, int); -- cgit v1.1 From 48e1cab1ba4db84fbc26379b887ba94a180347fe Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 14 Dec 2015 10:39:22 +0000 Subject: perf tools: Make options always available, even if required libs not linked This patch keeps options of perf builtins same in all conditions. If one option is disabled because of compiling options, users should be notified. 
Masami suggested another implementation in [1] that, by adding a OPTION_NEXT_DEPENDS option before those options in the 'struct option' array, options parser knows an option is disabled. However, in some cases this array is reordered (options__order()). In addition, in parse-option.c that array is const, so we can't simply merge information in decorator option into the affacted option. This patch chooses a simpler implementation that, introducing a set_option_nobuild() function and two option parsing flags. Builtins with such options should call set_option_nobuild() before option parsing. The complexity of this patch is because we want some of options can be skipped safely. In this case their arguments should also be consumed. Options in 'perf record' and 'perf probe' are fixed in this patch. [1] http://lkml.kernel.org/g/50399556C9727B4D88A595C8584AAB3752627CD4@GSjpTKYDCembx32.service.hitachi.net Test result: Normal case: # ./perf probe --vmlinux /tmp/vmlinux sys_write Added new event: probe:sys_write (on sys_write) You can now use it in all perf tools, such as: perf record -e probe:sys_write -aR sleep 1 Build with NO_DWARF=1: # ./perf probe -L sys_write Error: switch `L' is not available because NO_DWARF=1 Usage: perf probe [] 'PROBEDEF' ['PROBEDEF' ...] or: perf probe [] --add 'PROBEDEF' [--add 'PROBEDEF' ...] or: perf probe [] --del '[GROUP:]EVENT' ... or: perf probe --list [GROUP:]EVENT ... or: perf probe [] --funcs -L, --line Show source code lines. (not built-in because NO_DWARF=1) # ./perf probe -k /tmp/vmlinux sys_write Warning: switch `k' is being ignored because NO_DWARF=1 Added new event: probe:sys_write (on sys_write) You can now use it in all perf tools, such as: perf record -e probe:sys_write -aR sleep 1 # ./perf probe --vmlinux /tmp/vmlinux sys_write Warning: option `vmlinux' is being ignored because NO_DWARF=1 Added new event: [SNIP] # ./perf probe -l Usage: perf probe [] 'PROBEDEF' ['PROBEDEF' ...] or: perf probe [] --add 'PROBEDEF' [--add 'PROBEDEF' ...] ... -k, --vmlinux vmlinux pathname (not built-in because NO_DWARF=1) -L, --line Show source code lines. (not built-in because NO_DWARF=1) ... 
-V, --vars Show accessible variables on PROBEDEF (not built-in because NO_DWARF=1) --externs Show external variables too (with --vars only) (not built-in because NO_DWARF=1) --no-inlines Don't search inlined functions (not built-in because NO_DWARF=1) --range Show variables location range in scope (with --vars only) (not built-in because NO_DWARF=1) Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1450089563-122430-14-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-probe.c | 15 ++++- tools/perf/builtin-record.c | 9 ++- tools/perf/util/parse-options.c | 118 +++++++++++++++++++++++++++++++++++++--- tools/perf/util/parse-options.h | 5 ++ 4 files changed, 134 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 132afc9..dbe2ea5 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -249,6 +249,9 @@ static int opt_show_vars(const struct option *opt, return ret; } +#else +# define opt_show_lines NULL +# define opt_show_vars NULL #endif static int opt_add_probe_event(const struct option *opt, const char *str, int unset __maybe_unused) @@ -473,7 +476,6 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused) opt_add_probe_event), OPT_BOOLEAN('f', "force", &probe_conf.force_add, "forcibly add events" " with existing name"), -#ifdef HAVE_DWARF_SUPPORT OPT_CALLBACK('L', "line", NULL, "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]", "Show source code lines.", opt_show_lines), @@ -490,7 +492,6 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused) "directory", "path to kernel source"), OPT_BOOLEAN('\0', "no-inlines", &probe_conf.no_inlines, "Don't search inlined functions"), -#endif OPT__DRY_RUN(&probe_event_dry_run), OPT_INTEGER('\0', "max-probes", &probe_conf.max_probes, "Set how many probe points can be found for a probe."), @@ -521,6 +522,16 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused) #ifdef HAVE_DWARF_SUPPORT set_option_flag(options, 'L', "line", PARSE_OPT_EXCLUSIVE); set_option_flag(options, 'V', "vars", PARSE_OPT_EXCLUSIVE); +#else +# define set_nobuild(s, l, c) set_option_nobuild(options, s, l, "NO_DWARF=1", c) + set_nobuild('L', "line", false); + set_nobuild('V', "vars", false); + set_nobuild('\0', "externs", false); + set_nobuild('\0', "range", false); + set_nobuild('k', "vmlinux", true); + set_nobuild('s', "source", true); + set_nobuild('\0', "no-inlines", true); +# undef set_nobuild #endif set_option_flag(options, 'F', "funcs", PARSE_OPT_EXCLUSIVE); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 199fc31..c2ba377e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1113,12 +1113,10 @@ struct option __record_options[] = { "per thread proc mmap processing timeout in ms"), OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events, "Record context switch events"), -#ifdef HAVE_LIBBPF_SUPPORT OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path", "clang binary to use for compiling BPF scriptlets"), OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options", "options passed to clang when compiling BPF scriptlets"), -#endif OPT_END() }; @@ -1130,6 +1128,13 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) struct record *rec = &record; char 
errbuf[BUFSIZ]; +#ifndef HAVE_LIBBPF_SUPPORT +# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c) + set_nobuild('\0', "clang-path", true); + set_nobuild('\0', "clang-opt", true); +# undef set_nobuild +#endif + rec->evlist = perf_evlist__new(); if (rec->evlist == NULL) return -ENOMEM; diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index 14b2bee..0ad1384 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -18,20 +18,34 @@ static int opterror(const struct option *opt, const char *reason, int flags) return error("option `%s' %s", opt->long_name, reason); } +static void optwarning(const struct option *opt, const char *reason, int flags) +{ + if (flags & OPT_SHORT) + warning("switch `%c' %s", opt->short_name, reason); + else if (flags & OPT_UNSET) + warning("option `no-%s' %s", opt->long_name, reason); + else + warning("option `%s' %s", opt->long_name, reason); +} + static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, int flags, const char **arg) { + const char *res; + if (p->opt) { - *arg = p->opt; + res = p->opt; p->opt = NULL; } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 || **(p->argv + 1) == '-')) { - *arg = (const char *)opt->defval; + res = (const char *)opt->defval; } else if (p->argc > 1) { p->argc--; - *arg = *++p->argv; + res = *++p->argv; } else return opterror(opt, "requires a value", flags); + if (arg) + *arg = res; return 0; } @@ -91,6 +105,64 @@ static int get_value(struct parse_opt_ctx_t *p, } } + if (opt->flags & PARSE_OPT_NOBUILD) { + char reason[128]; + bool noarg = false; + + err = snprintf(reason, sizeof(reason), + opt->flags & PARSE_OPT_CANSKIP ? + "is being ignored because %s " : + "is not available because %s", + opt->build_opt); + reason[sizeof(reason) - 1] = '\0'; + + if (err < 0) + strncpy(reason, opt->flags & PARSE_OPT_CANSKIP ? 
+ "is being ignored" : + "is not available", + sizeof(reason)); + + if (!(opt->flags & PARSE_OPT_CANSKIP)) + return opterror(opt, reason, flags); + + err = 0; + if (unset) + noarg = true; + if (opt->flags & PARSE_OPT_NOARG) + noarg = true; + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + noarg = true; + + switch (opt->type) { + case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_BIT: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + noarg = true; + break; + case OPTION_CALLBACK: + case OPTION_STRING: + case OPTION_INTEGER: + case OPTION_UINTEGER: + case OPTION_LONG: + case OPTION_U64: + default: + break; + } + + if (!noarg) + err = get_arg(p, opt, flags, NULL); + if (err) + return err; + + optwarning(opt, reason, flags); + return 0; + } + switch (opt->type) { case OPTION_BIT: if (unset) @@ -645,6 +717,10 @@ static void print_option_help(const struct option *opts, int full) pad = USAGE_OPTS_WIDTH; } fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help); + if (opts->flags & PARSE_OPT_NOBUILD) + fprintf(stderr, "%*s(not built-in because %s)\n", + USAGE_OPTS_WIDTH + USAGE_GAP, "", + opts->build_opt); } static int option__cmp(const void *va, const void *vb) @@ -848,15 +924,39 @@ int parse_opt_verbosity_cb(const struct option *opt, return 0; } -void set_option_flag(struct option *opts, int shortopt, const char *longopt, - int flag) +static struct option * +find_option(struct option *opts, int shortopt, const char *longopt) { for (; opts->type != OPTION_END; opts++) { if ((shortopt && opts->short_name == shortopt) || (opts->long_name && longopt && - !strcmp(opts->long_name, longopt))) { - opts->flags |= flag; - break; - } + !strcmp(opts->long_name, longopt))) + return opts; } + return NULL; +} + +void set_option_flag(struct option *opts, int shortopt, const char *longopt, + int flag) +{ + struct option *opt = find_option(opts, shortopt, longopt); + + if (opt) + opt->flags |= flag; + return; +} + +void set_option_nobuild(struct option *opts, int shortopt, + const char *longopt, + const char *build_opt, + bool can_skip) +{ + struct option *opt = find_option(opts, shortopt, longopt); + + if (!opt) + return; + + opt->flags |= PARSE_OPT_NOBUILD; + opt->flags |= can_skip ? PARSE_OPT_CANSKIP : 0; + opt->build_opt = build_opt; } diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h index dd1236d..1231960 100644 --- a/tools/perf/util/parse-options.h +++ b/tools/perf/util/parse-options.h @@ -41,6 +41,8 @@ enum parse_opt_option_flags { PARSE_OPT_DISABLED = 32, PARSE_OPT_EXCLUSIVE = 64, PARSE_OPT_NOEMPTY = 128, + PARSE_OPT_NOBUILD = 256, + PARSE_OPT_CANSKIP = 512, }; struct option; @@ -96,6 +98,7 @@ struct option { void *value; const char *argh; const char *help; + const char *build_opt; int flags; parse_opt_cb *callback; @@ -217,4 +220,6 @@ extern int parse_opt_verbosity_cb(const struct option *, const char *, int); extern const char *parse_options_fix_filename(const char *prefix, const char *file); void set_option_flag(struct option *opts, int sopt, const char *lopt, int flag); +void set_option_nobuild(struct option *opts, int shortopt, const char *longopt, + const char *build_opt, bool can_skip); #endif /* __PERF_PARSE_OPTIONS_H */ -- cgit v1.1 From 7efe0e034c713716060bc7794c7e332589980c70 Mon Sep 17 00:00:00 2001 From: He Kuang Date: Mon, 14 Dec 2015 10:39:23 +0000 Subject: perf record: Support custom vmlinux path Make perf-record command support --vmlinux option if BPF_PROLOGUE is on. 
'perf record' needs vmlinux as the source of DWARF info to generate prologue for BPF programs, so path of vmlinux should be specified. Short name 'k' has been taken by 'clockid'. This patch skips the short option name and uses '--vmlinux' for vmlinux path. Documentation is also updated. Test result: In a production (or broken) environment: (by: # rm -rf ~/.debug/ # mv /lib/modules/`uname -r`/build/vmlinux /tmp/ ) # ./perf record -e ./test_bpf_base.c ls Failed to find the path for kernel: No such file or directory event syntax error: './test_bpf_base.c' \___ You need to check probing points in BPF file ... # ./perf record --vmlinux /tmp/vmlinux -e ./test_bpf_base.c ls ... [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.011 MB perf.data ] Help messages when build with NO_LIBBPF: # ./perf record -h --transaction sample transaction flags (special events only) --vmlinux vmlinux pathname (not built-in because NO_LIBBPF=1) # ./perf record --vmlinux /tmp/vmlinux ls / Warning: option `vmlinux' is being ignored because NO_LIBBPF=1 ... [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.011 MB perf.data (11 samples) ] Help messages when build with NO_DWARF: # ./perf record -h --transaction sample transaction flags (special events only) --vmlinux vmlinux pathname (not built-in because NO_DWARF=1) Signed-off-by: He Kuang Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1450089563-122430-15-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 10 ++++++++-- tools/perf/builtin-record.c | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index e630a7d..8d032f4 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -314,11 +314,17 @@ This option sets the time out limit. The default value is 500 ms. Record context switch events i.e. events of type PERF_RECORD_SWITCH or PERF_RECORD_SWITCH_CPU_WIDE. ---clang-path:: +--clang-path=PATH:: Path to clang binary to use for compiling BPF scriptlets. +(enabled when BPF support is on) ---clang-opt:: +--clang-opt=OPTIONS:: Options passed to clang when compiling BPF scriptlets. +(enabled when BPF support is on) + +--vmlinux=PATH:: +Specify vmlinux path which has debuginfo. 
+(enabled when BPF prologue is on) SEE ALSO -------- diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index c2ba377e..3ef3c79 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1117,6 +1117,8 @@ struct option __record_options[] = { "clang binary to use for compiling BPF scriptlets"), OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options", "options passed to clang when compiling BPF scriptlets"), + OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name, + "file", "vmlinux pathname"), OPT_END() }; @@ -1135,6 +1137,20 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) # undef set_nobuild #endif +#ifndef HAVE_BPF_PROLOGUE +# if !defined (HAVE_DWARF_SUPPORT) +# define REASON "NO_DWARF=1" +# elif !defined (HAVE_LIBBPF_SUPPORT) +# define REASON "NO_LIBBPF=1" +# else +# define REASON "this architecture doesn't support BPF prologue" +# endif +# define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c) + set_nobuild('\0', "vmlinux", true); +# undef set_nobuild +# undef REASON +#endif + rec->evlist = perf_evlist__new(); if (rec->evlist == NULL) return -ENOMEM; -- cgit v1.1 From 7a29c087ff80f5d534bd6729c852099fc572c8d0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 15 Dec 2015 10:49:56 +0900 Subject: perf record: Add record.build-id config option Post processing at 'perf record' takes a long time on big machines. What it does is to find the build-id of binaries found in the event stream, so that it can make sure, at 'report' time, that the symtabs (be it ELF, kallsyms, etc) being used to resolve symbols are the ones matching the binaries found at 'record' time. Sometimes we just want to skip this processing of events at the end of the session to get quicker results, making sure the binaries haven't changed from 'record' to 'report' time. Add a new config option to control this behavior. The record.build-id config variable can have one of the following values: - cache: post-process data and save/update the binaries into the build-id cache (in ~/.debug). This is the default. - no-cache: post-process the data but not update the build-id cache. Same effect as using the -N option. - skip: skip post-processing and do not update the cache. Same effect as using the -B option. Reported-and-Acked-by: Peter Zijlstra (Intel) Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: David Ahern Cc: Taeung Song Link: http://lkml.kernel.org/r/1450144196-22957-1-git-send-email-namhyung@kernel.org [ Added some more text to the documentation ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-record.txt | 14 +++++++++++++- tools/perf/builtin-record.c | 13 +++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 8d032f4..3a1a32f 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -207,11 +207,23 @@ comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0- In per-thread mode with inheritance mode on (default), samples are captured only when the thread executes on the designated CPUs. Default is to monitor all CPUs. +-B:: +--no-buildid:: +Do not save the build ids of binaries in the perf.data files. This skips +post processing after recording, which sometimes makes the final step in +the recording process to take a long time, as it needs to process all +events looking for mmap records. 
The downside is that it can misresolve +symbols if the workload binaries used when recording get locally rebuilt +or upgraded, because the only key available in this case is the +pathname. You can also set the "record.build-id" config variable to +'skip to have this behaviour permanently. + -N:: --no-buildid-cache:: Do not update the buildid cache. This saves some overhead in situations where the information in the perf.data file (which includes buildids) -is sufficient. +is sufficient. You can also set the "record.build-id" config variable to +'no-cache' to have the same effect. -G name,...:: --cgroup name,...:: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 3ef3c79..a3b4930 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -837,6 +837,19 @@ int record_callchain_opt(const struct option *opt, static int perf_record_config(const char *var, const char *value, void *cb) { + struct record *rec = cb; + + if (!strcmp(var, "record.build-id")) { + if (!strcmp(value, "cache")) + rec->no_buildid_cache = false; + else if (!strcmp(value, "no-cache")) + rec->no_buildid_cache = true; + else if (!strcmp(value, "skip")) + rec->no_buildid = true; + else + return -1; + return 0; + } if (!strcmp(var, "record.call-graph")) var = "call-graph.record-mode"; /* fall-through */ -- cgit v1.1 From 1925459b4d92d92e62d67ddc763cda650d2aa79c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:32 -0600 Subject: tools build: Fix feature Makefile issues with 'O=' When building perf binaries outside the source tree with 'make O=

...
        ', the auto-detected features get re-tested for every build, which is unnecessary and inconsistent with the behavior seen when building directly in the source tree. Another issue is that 'make O= clean' doesn't remove the feature files from the object tree. Fix these problems by looking for the binaries in the $(OUTPUT) directory. Signed-off-by: Josh Poimboeuf Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/113bd01530e9761778c60a75a96c65fc59860f68.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 2 +- tools/build/feature/Makefile | 93 ++++++++++++++++++++++---------------------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index b8c31ec..6c0519d 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -7,7 +7,7 @@ endif feature_check = $(eval $(feature_check_code)) define feature_check_code - feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0) + feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0) endef feature_set = $(eval $(feature_set_code)) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index cea04ce9..bf8f035 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -1,4 +1,3 @@ - FILES= \ test-all.bin \ test-backtrace.bin \ @@ -38,38 +37,40 @@ FILES= \ test-bpf.bin \ test-get_cpuid.bin +FILES := $(addprefix $(OUTPUT),$(FILES)) + CC := $(CROSS_COMPILE)gcc -MD PKG_CONFIG := $(CROSS_COMPILE)pkg-config all: $(FILES) -__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS) - BUILD = $(__BUILD) > $(OUTPUT)$(@:.bin=.make.output) 2>&1 +__BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) + BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 ############################### -test-all.bin: +$(OUTPUT)test-all.bin: $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -test-hello.bin: +$(OUTPUT)test-hello.bin: $(BUILD) -test-pthread-attr-setaffinity-np.bin: +$(OUTPUT)test-pthread-attr-setaffinity-np.bin: $(BUILD) -D_GNU_SOURCE -lpthread -test-stackprotector-all.bin: +$(OUTPUT)test-stackprotector-all.bin: $(BUILD) -fstack-protector-all -test-fortify-source.bin: +$(OUTPUT)test-fortify-source.bin: $(BUILD) -O2 -D_FORTIFY_SOURCE=2 -test-bionic.bin: +$(OUTPUT)test-bionic.bin: $(BUILD) -test-libelf.bin: +$(OUTPUT)test-libelf.bin: $(BUILD) -lelf -test-glibc.bin: +$(OUTPUT)test-glibc.bin: $(BUILD) DWARFLIBS := -ldw @@ -77,37 +78,37 @@ ifeq ($(findstring -static,${LDFLAGS}),-static) DWARFLIBS += -lelf -lebl -lz -llzma -lbz2 endif -test-dwarf.bin: +$(OUTPUT)test-dwarf.bin: $(BUILD) $(DWARFLIBS) -test-libelf-mmap.bin: +$(OUTPUT)test-libelf-mmap.bin: $(BUILD) -lelf -test-libelf-getphdrnum.bin: +$(OUTPUT)test-libelf-getphdrnum.bin: $(BUILD) -lelf -test-libnuma.bin: +$(OUTPUT)test-libnuma.bin: $(BUILD) -lnuma 
-test-numa_num_possible_cpus.bin: +$(OUTPUT)test-numa_num_possible_cpus.bin: $(BUILD) -lnuma -test-libunwind.bin: +$(OUTPUT)test-libunwind.bin: $(BUILD) -lelf -test-libunwind-debug-frame.bin: +$(OUTPUT)test-libunwind-debug-frame.bin: $(BUILD) -lelf -test-libaudit.bin: +$(OUTPUT)test-libaudit.bin: $(BUILD) -laudit -test-libslang.bin: +$(OUTPUT)test-libslang.bin: $(BUILD) -I/usr/include/slang -lslang -test-gtk2.bin: +$(OUTPUT)test-gtk2.bin: $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) -test-gtk2-infobar.bin: +$(OUTPUT)test-gtk2-infobar.bin: $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) grep-libs = $(filter -l%,$(1)) @@ -119,63 +120,63 @@ PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS)) PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null` FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS) -test-libperl.bin: +$(OUTPUT)test-libperl.bin: $(BUILD) $(FLAGS_PERL_EMBED) -test-libpython.bin: +$(OUTPUT)test-libpython.bin: $(BUILD) -test-libpython-version.bin: +$(OUTPUT)test-libpython-version.bin: $(BUILD) -test-libbfd.bin: +$(OUTPUT)test-libbfd.bin: $(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl -test-liberty.bin: - $(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty +$(OUTPUT)test-liberty.bin: + $(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty -test-liberty-z.bin: - $(CC) $(CFLAGS) -Wall -Werror -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty -lz +$(OUTPUT)test-liberty-z.bin: + $(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty -lz -test-cplus-demangle.bin: +$(OUTPUT)test-cplus-demangle.bin: $(BUILD) -liberty -test-backtrace.bin: +$(OUTPUT)test-backtrace.bin: $(BUILD) -test-timerfd.bin: +$(OUTPUT)test-timerfd.bin: $(BUILD) -test-libdw-dwarf-unwind.bin: +$(OUTPUT)test-libdw-dwarf-unwind.bin: $(BUILD) # -ldw provided by $(FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind) -test-libbabeltrace.bin: +$(OUTPUT)test-libbabeltrace.bin: $(BUILD) # -lbabeltrace provided by $(FEATURE_CHECK_LDFLAGS-libbabeltrace) -test-sync-compare-and-swap.bin: +$(OUTPUT)test-sync-compare-and-swap.bin: $(BUILD) -test-compile-32.bin: - $(CC) -m32 -o $(OUTPUT)$@ test-compile.c +$(OUTPUT)test-compile-32.bin: + $(CC) -m32 -o $@ test-compile.c -test-compile-x32.bin: - $(CC) -mx32 -o $(OUTPUT)$@ test-compile.c +$(OUTPUT)test-compile-x32.bin: + $(CC) -mx32 -o $@ test-compile.c -test-zlib.bin: +$(OUTPUT)test-zlib.bin: $(BUILD) -lz -test-lzma.bin: +$(OUTPUT)test-lzma.bin: $(BUILD) -llzma -test-get_cpuid.bin: +$(OUTPUT)test-get_cpuid.bin: $(BUILD) -test-bpf.bin: +$(OUTPUT)test-bpf.bin: $(BUILD) --include *.d +-include $(OUTPUT)*.d ############################### clean: - rm -f $(FILES) *.d $(FILES:.bin=.make.output) + rm -f $(FILES) $(OUTPUT)*.d $(FILES:.bin=.make.output) -- cgit v1.1 From ce99091730c92bf560712baa0696ea5a461b1fe8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:33 -0600 Subject: perf tools: Move strlcpy() from perf to tools/lib/string.c strlcpy() will be needed by the subcmd library. Move it to the shared tools/lib/string.c file which can be used by other tools. 
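The __weak attribute on the moved function does real work: if libc already ships strlcpy() (uClibc, or glibc plus libbsd), the libc symbol wins at link time and the fallback is discarded. A small usage sketch of the BSD semantics, assuming tools/include is on the include path and tools/lib/string.c (or a libc strlcpy) is linked in:

	#include <stdio.h>
	#include <linux/string.h>	/* the tools/include header in the diff below */

	int main(void)
	{
		char buf[8];
		size_t want = strlcpy(buf, "hello, world", sizeof(buf));

		/* buf is always NUL-terminated; it now holds "hello, ".
		 * The return value is the length of the *source*, so
		 * truncation is simply 'want >= sizeof(buf)'. */
		if (want >= sizeof(buf))
			printf("truncated to \"%s\" (needed %zu bytes)\n",
			       buf, want + 1);
		return 0;
	}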
Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/71e2804b973bf39ad3d3b9be10f99f2ea630be46.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/string.h | 4 ++++ tools/lib/string.c | 27 +++++++++++++++++++++++++++ tools/perf/util/cache.h | 7 ++----- tools/perf/util/path.c | 18 ------------------ 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index 2e2f736..e26223f 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -8,4 +8,8 @@ void *memdup(const void *src, size_t len); int strtobool(const char *s, bool *res); +#ifndef __UCLIBC__ +extern size_t strlcpy(char *dest, const char *src, size_t size); +#endif + #endif /* _LINUX_STRING_H_ */ diff --git a/tools/lib/string.c b/tools/lib/string.c index 065e54f..bd239bc 100644 --- a/tools/lib/string.c +++ b/tools/lib/string.c @@ -16,6 +16,7 @@ #include #include #include +#include /** * memdup - duplicate region of memory @@ -60,3 +61,29 @@ int strtobool(const char *s, bool *res) } return 0; } + +/** + * strlcpy - Copy a C-string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @size: size of destination buffer + * + * Compatible with *BSD: the result is always a valid + * NUL-terminated string that fits in the buffer (unless, + * of course, the buffer size is zero). It does not pad + * out the result like strncpy() does. + * + * If libc has strlcpy() then that version will override this + * implementation: + */ +size_t __weak strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 9ca4a58..d723ecb 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -8,6 +8,8 @@ #include "../perf.h" #include "../ui/ui.h" +#include + #define CMD_EXEC_PATH "--exec-path" #define CMD_PERF_DIR "--perf-dir=" #define CMD_WORK_TREE "--work-tree=" @@ -67,9 +69,4 @@ extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2 extern char *perf_pathdup(const char *fmt, ...) __attribute__((format (printf, 1, 2))); -#ifndef __UCLIBC__ -/* Matches the libc/libbsd function attribute so we declare this unconditionally: */ -extern size_t strlcpy(char *dest, const char *src, size_t size); -#endif - #endif /* __PERF_CACHE_H */ diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c index 5d13cb4..3654d96 100644 --- a/tools/perf/util/path.c +++ b/tools/perf/util/path.c @@ -22,24 +22,6 @@ static const char *get_perf_dir(void) return "."; } -/* - * If libc has strlcpy() then that version will override this - * implementation: - */ -size_t __weak strlcpy(char *dest, const char *src, size_t size) -{ - size_t ret = strlen(src); - - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - - memcpy(dest, src, len); - dest[len] = '\0'; - } - - return ret; -} - static char *get_pathname(void) { static char pathname_array[4][PATH_MAX]; -- cgit v1.1 From 24a88bdd05d5de32f3a56a4dcc5070c97d4a514f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:34 -0600 Subject: perf tools: Document the fact that parse_options*() may exit Generally, calling exit() from a library is bad practice. Eventually these functions might be redesigned so that they don't exit. 
For now, just document the fact that they do. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/97b1af06cc3b18dd0f49e655d6d659eaa64ecde5.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-options.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h index 1231960..d1544069 100644 --- a/tools/perf/util/parse-options.h +++ b/tools/perf/util/parse-options.h @@ -152,6 +152,9 @@ struct option { /* parse_options() will filter out the processed options and leave the * non-option argments in argv[]. * Returns the number of arguments left in argv[]. + * + * NOTE: parse_options() and parse_options_subcommand() may call exit() in the + * case of an error (or for 'special' options like --list-cmds or --list-opts). */ extern int parse_options(int argc, const char **argv, const struct option *options, -- cgit v1.1 From 096d35585b4fce7d3ee9b8b34314f39f49491ab1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:35 -0600 Subject: perf tools: Provide subcmd configuration at runtime Create init functions for exec_cmd.c and pager.c. This allows their configuration to be specified at runtime so they can be split out into a separate library which can be used by other programs. Their configuration is stored in a shared subcmd_config struct. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/21f5f6b38da72c985a8dcfa185700d03e7eecd1d.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 5 ++++- tools/perf/perf.c | 6 +++++- tools/perf/util/Build | 2 +- tools/perf/util/cache.h | 1 + tools/perf/util/exec_cmd.c | 23 ++++++++++++++++------- tools/perf/util/exec_cmd.h | 3 +++ tools/perf/util/pager.c | 8 +++++++- tools/perf/util/pager.h | 2 ++ tools/perf/util/parse-options.c | 4 +++- tools/perf/util/subcmd-config.c | 11 +++++++++++ tools/perf/util/subcmd-config.h | 14 ++++++++++++++ 11 files changed, 67 insertions(+), 12 deletions(-) create mode 100644 tools/perf/util/subcmd-config.c create mode 100644 tools/perf/util/subcmd-config.h diff --git a/tools/perf/Build b/tools/perf/Build index 2a41217..00c4b8c 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -36,7 +36,10 @@ paths += -DPERF_MAN_PATH="BUILD_STR($(mandir_SQ))" CFLAGS_builtin-help.o += $(paths) CFLAGS_builtin-timechart.o += $(paths) -CFLAGS_perf.o += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" -include $(OUTPUT)PERF-VERSION-FILE +CFLAGS_perf.o += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" \ + -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))" \ + -DPREFIX="BUILD_STR($(prefix_SQ))" \ + -include $(OUTPUT)PERF-VERSION-FILE CFLAGS_builtin-trace.o += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_SQ))" libperf-y += util/ diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 59ea48c..783a331 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -119,7 +119,7 @@ static void commit_pager_choice(void) { switch (use_pager) { case 0: - setenv("PERF_PAGER", "cat", 1); + setenv(PERF_PAGER_ENVIRONMENT, "cat", 1); break; case 1: /* setup_pager(); */ @@ -530,6 +530,10 @@ int main(int argc, const char **argv) const char *cmd; char sbuf[STRERR_BUFSIZE]; + /* libsubcmd init */ + exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); + pager_init(PERF_PAGER_ENVIRONMENT); + /* The page_size is placed in util object. 
*/ page_size = sysconf(_SC_PAGE_SIZE); cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 99b3dae..196beef 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -88,6 +88,7 @@ libperf-y += parse-branch-options.o libperf-y += parse-regs-options.o libperf-y += term.o libperf-y += help-unknown-cmd.o +libperf-y += subcmd-config.o libperf-$(CONFIG_LIBBPF) += bpf-loader.o libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o @@ -113,7 +114,6 @@ libperf-$(CONFIG_ZLIB) += zlib.o libperf-$(CONFIG_LZMA) += lzma.o CFLAGS_config.o += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" -CFLAGS_exec_cmd.o += -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))" -DPREFIX="BUILD_STR($(prefix_SQ))" $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c $(call rule_mkdir) diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index d723ecb..fc6a745 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -21,6 +21,7 @@ #define DEFAULT_PERF_DIR_ENVIRONMENT ".perf" #define PERF_DEBUGFS_ENVIRONMENT "PERF_DEBUGFS_DIR" #define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR" +#define PERF_PAGER_ENVIRONMENT "PERF_PAGER" typedef int (*config_fn_t)(const char *, const char *, void *); extern int perf_default_config(const char *, const char *, void *); diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c index 1099e92..b935e4c 100644 --- a/tools/perf/util/exec_cmd.c +++ b/tools/perf/util/exec_cmd.c @@ -1,6 +1,7 @@ #include "cache.h" #include "exec_cmd.h" #include "quote.h" +#include "subcmd-config.h" #include @@ -9,15 +10,23 @@ static const char *argv_exec_path; static const char *argv0_path; +void exec_cmd_init(const char *exec_name, const char *prefix, + const char *exec_path, const char *exec_path_env) +{ + subcmd_config.exec_name = exec_name; + subcmd_config.prefix = prefix; + subcmd_config.exec_path = exec_path; + subcmd_config.exec_path_env = exec_path_env; +} + char *system_path(const char *path) { - static const char *prefix = PREFIX; struct strbuf d = STRBUF_INIT; if (is_absolute_path(path)) return strdup(path); - strbuf_addf(&d, "%s/%s", prefix, path); + strbuf_addf(&d, "%s/%s", subcmd_config.prefix, path); path = strbuf_detach(&d, NULL); return (char *)path; } @@ -47,7 +56,7 @@ void perf_set_argv_exec_path(const char *exec_path) /* * Propagate this setting to external programs. 
*/ - setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1); + setenv(subcmd_config.exec_path_env, exec_path, 1); } @@ -59,11 +68,11 @@ char *perf_exec_path(void) if (argv_exec_path) return strdup(argv_exec_path); - env = getenv(EXEC_PATH_ENVIRONMENT); + env = getenv(subcmd_config.exec_path_env); if (env && *env) return strdup(env); - return system_path(PERF_EXEC_PATH); + return system_path(subcmd_config.exec_path); } static void add_path(struct strbuf *out, const char *path) @@ -107,7 +116,7 @@ static const char **prepare_perf_cmd(const char **argv) ; /* just counting */ nargv = malloc(sizeof(*nargv) * (argc + 2)); - nargv[0] = "perf"; + nargv[0] = subcmd_config.exec_name; for (argc = 0; argv[argc]; argc++) nargv[argc + 1] = argv[argc]; nargv[argc + 1] = NULL; @@ -118,7 +127,7 @@ int execv_perf_cmd(const char **argv) { const char **nargv = prepare_perf_cmd(argv); /* execvp() can only ever return if it fails */ - execvp("perf", (char **)nargv); + execvp(subcmd_config.exec_name, (char **)nargv); free(nargv); return -1; diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h index 48b4175..fd4434e 100644 --- a/tools/perf/util/exec_cmd.h +++ b/tools/perf/util/exec_cmd.h @@ -1,6 +1,9 @@ #ifndef __PERF_EXEC_CMD_H #define __PERF_EXEC_CMD_H +extern void exec_cmd_init(const char *exec_name, const char *prefix, + const char *exec_path, const char *exec_path_env); + extern void perf_set_argv_exec_path(const char *exec_path); extern const char *perf_extract_argv0_path(const char *path); extern void setup_path(void); diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c index 7dcbef6..d5ef62e 100644 --- a/tools/perf/util/pager.c +++ b/tools/perf/util/pager.c @@ -1,6 +1,7 @@ #include "cache.h" #include "run-command.h" #include "sigchain.h" +#include "subcmd-config.h" /* * This is split up from the rest of git so that we can do @@ -9,6 +10,11 @@ static int spawned_pager; +void pager_init(const char *pager_env) +{ + subcmd_config.pager_env = pager_env; +} + static void pager_preexec(void) { /* @@ -46,7 +52,7 @@ static void wait_for_pager_signal(int signo) void setup_pager(void) { - const char *pager = getenv("PERF_PAGER"); + const char *pager = getenv(subcmd_config.pager_env); if (!isatty(1)) return; diff --git a/tools/perf/util/pager.h b/tools/perf/util/pager.h index 2794a83..d6a591a 100644 --- a/tools/perf/util/pager.h +++ b/tools/perf/util/pager.h @@ -1,6 +1,8 @@ #ifndef __PERF_PAGER_H #define __PERF_PAGER_H +extern void pager_init(const char *pager_env); + extern void setup_pager(void); extern int pager_in_use(void); diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index 0ad1384..da4ba21 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -2,6 +2,7 @@ #include "parse-options.h" #include "cache.h" #include "header.h" +#include "subcmd-config.h" #include #define OPT_SHORT 1 @@ -577,7 +578,8 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o if (subcommands && !usagestr[0]) { struct strbuf buf = STRBUF_INIT; - strbuf_addf(&buf, "perf %s [] {", argv[0]); + strbuf_addf(&buf, "%s %s [] {", + subcmd_config.exec_name, argv[0]); for (int i = 0; subcommands[i]; i++) { if (i) strbuf_addstr(&buf, "|"); diff --git a/tools/perf/util/subcmd-config.c b/tools/perf/util/subcmd-config.c new file mode 100644 index 0000000..d017c72 --- /dev/null +++ b/tools/perf/util/subcmd-config.c @@ -0,0 +1,11 @@ +#include "subcmd-config.h" + +#define UNDEFINED "SUBCMD_HAS_NOT_BEEN_INITIALIZED" + +struct subcmd_config 
subcmd_config = { + .exec_name = UNDEFINED, + .prefix = UNDEFINED, + .exec_path = UNDEFINED, + .exec_path_env = UNDEFINED, + .pager_env = UNDEFINED, +}; diff --git a/tools/perf/util/subcmd-config.h b/tools/perf/util/subcmd-config.h new file mode 100644 index 0000000..cc85140 --- /dev/null +++ b/tools/perf/util/subcmd-config.h @@ -0,0 +1,14 @@ +#ifndef __PERF_SUBCMD_CONFIG_H +#define __PERF_SUBCMD_CONFIG_H + +struct subcmd_config { + const char *exec_name; + const char *prefix; + const char *exec_path; + const char *exec_path_env; + const char *pager_env; +}; + +extern struct subcmd_config subcmd_config; + +#endif /* __PERF_SUBCMD_CONFIG_H */ -- cgit v1.1 From 901421a5bdf605d24c278825cdd032cd6038bcb8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:36 -0600 Subject: perf tools: Remove subcmd dependencies on strbuf Introduce and use new astrcat() and astrcatf() functions which replace the strbuf functionality for subcmd. For now they duplicate strbuf's die-on-allocation-error policy. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/957d207e1254406fa11fc2e405e75a7e405aad8f.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/exec_cmd.c | 27 +++++++++++++------------- tools/perf/util/help.c | 14 ++++++-------- tools/perf/util/parse-options.c | 42 +++++++++++++++++++++-------------------- tools/perf/util/subcmd-util.h | 24 +++++++++++++++++++++++ 4 files changed, 66 insertions(+), 41 deletions(-) create mode 100644 tools/perf/util/subcmd-util.h diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c index b935e4c..65d86dc 100644 --- a/tools/perf/util/exec_cmd.c +++ b/tools/perf/util/exec_cmd.c @@ -4,6 +4,7 @@ #include "subcmd-config.h" #include +#include "subcmd-util.h" #define MAX_ARGS 32 @@ -21,14 +22,14 @@ void exec_cmd_init(const char *exec_name, const char *prefix, char *system_path(const char *path) { - struct strbuf d = STRBUF_INIT; + char *buf = NULL; if (is_absolute_path(path)) return strdup(path); - strbuf_addf(&d, "%s/%s", subcmd_config.prefix, path); - path = strbuf_detach(&d, NULL); - return (char *)path; + astrcatf(&buf, "%s/%s", subcmd_config.prefix, path); + + return buf; } const char *perf_extract_argv0_path(const char *argv0) @@ -75,22 +76,22 @@ char *perf_exec_path(void) return system_path(subcmd_config.exec_path); } -static void add_path(struct strbuf *out, const char *path) +static void add_path(char **out, const char *path) { if (path && *path) { if (is_absolute_path(path)) - strbuf_addstr(out, path); + astrcat(out, path); else - strbuf_addstr(out, make_nonrelative_path(path)); + astrcat(out, make_nonrelative_path(path)); - strbuf_addch(out, PATH_SEP); + astrcat(out, ":"); } } void setup_path(void) { const char *old_path = getenv("PATH"); - struct strbuf new_path = STRBUF_INIT; + char *new_path = NULL; char *tmp = perf_exec_path(); add_path(&new_path, tmp); @@ -98,13 +99,13 @@ void setup_path(void) free(tmp); if (old_path) - strbuf_addstr(&new_path, old_path); + astrcat(&new_path, old_path); else - strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin"); + astrcat(&new_path, "/usr/local/bin:/usr/bin:/bin"); - setenv("PATH", new_path.buf, 1); + setenv("PATH", new_path, 1); - strbuf_release(&new_path); + free(new_path); } static const char **prepare_perf_cmd(const char **argv) diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c index 8d74f7d..8e5e0ce 100644 --- a/tools/perf/util/help.c +++ b/tools/perf/util/help.c @@ -2,6 
+2,7 @@ #include "../builtin.h" #include "exec_cmd.h" #include "help.h" +#include "subcmd-util.h" void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) { @@ -119,8 +120,7 @@ static void list_commands_in_dir(struct cmdnames *cmds, int prefix_len; DIR *dir = opendir(path); struct dirent *de; - struct strbuf buf = STRBUF_INIT; - int len; + char *buf = NULL; if (!dir) return; @@ -128,8 +128,7 @@ static void list_commands_in_dir(struct cmdnames *cmds, prefix = "perf-"; prefix_len = strlen(prefix); - strbuf_addf(&buf, "%s/", path); - len = buf.len; + astrcatf(&buf, "%s/", path); while ((de = readdir(dir)) != NULL) { int entlen; @@ -137,9 +136,8 @@ static void list_commands_in_dir(struct cmdnames *cmds, if (prefixcmp(de->d_name, prefix)) continue; - strbuf_setlen(&buf, len); - strbuf_addstr(&buf, de->d_name); - if (!is_executable(buf.buf)) + astrcat(&buf, de->d_name); + if (!is_executable(buf)) continue; entlen = strlen(de->d_name) - prefix_len; @@ -149,7 +147,7 @@ static void list_commands_in_dir(struct cmdnames *cmds, add_cmdname(cmds, de->d_name + prefix_len, entlen); } closedir(dir); - strbuf_release(&buf); + free(buf); } void load_command_list(const char *prefix, diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index da4ba21..c1da2a5 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -1,4 +1,5 @@ #include "util.h" +#include "subcmd-util.h" #include "parse-options.h" #include "cache.h" #include "header.h" @@ -8,7 +9,7 @@ #define OPT_SHORT 1 #define OPT_UNSET 2 -static struct strbuf error_buf = STRBUF_INIT; +char *error_buf; static int opterror(const struct option *opt, const char *reason, int flags) { @@ -576,19 +577,18 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o /* build usage string if it's not provided */ if (subcommands && !usagestr[0]) { - struct strbuf buf = STRBUF_INIT; + char *buf = NULL; + + astrcatf(&buf, "%s %s [] {", subcmd_config.exec_name, argv[0]); - strbuf_addf(&buf, "%s %s [] {", - subcmd_config.exec_name, argv[0]); for (int i = 0; subcommands[i]; i++) { if (i) - strbuf_addstr(&buf, "|"); - strbuf_addstr(&buf, subcommands[i]); + astrcat(&buf, "|"); + astrcat(&buf, subcommands[i]); } - strbuf_addstr(&buf, "}"); + astrcat(&buf, "}"); - usagestr[0] = strdup(buf.buf); - strbuf_release(&buf); + usagestr[0] = buf; } parse_options_start(&ctx, argc, argv, flags); @@ -613,13 +613,11 @@ int parse_options_subcommand(int argc, const char **argv, const struct option *o putchar('\n'); exit(130); default: /* PARSE_OPT_UNKNOWN */ - if (ctx.argv[0][1] == '-') { - strbuf_addf(&error_buf, "unknown option `%s'", - ctx.argv[0] + 2); - } else { - strbuf_addf(&error_buf, "unknown switch `%c'", - *ctx.opt); - } + if (ctx.argv[0][1] == '-') + astrcatf(&error_buf, "unknown option `%s'", + ctx.argv[0] + 2); + else + astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt); usage_with_options(usagestr, options); } @@ -806,9 +804,9 @@ static int usage_with_options_internal(const char * const *usagestr, setup_pager(); - if (strbuf_avail(&error_buf)) { - fprintf(stderr, " Error: %s\n", error_buf.buf); - strbuf_release(&error_buf); + if (error_buf) { + fprintf(stderr, " Error: %s\n", error_buf); + zfree(&error_buf); } fprintf(stderr, "\n Usage: %s\n", *usagestr++); @@ -852,11 +850,15 @@ void usage_with_options_msg(const char * const *usagestr, const struct option *opts, const char *fmt, ...) 
{ va_list ap; + char *tmp = error_buf; va_start(ap, fmt); - strbuf_addv(&error_buf, fmt, ap); + if (vasprintf(&error_buf, fmt, ap) == -1) + die("vasprintf failed"); va_end(ap); + free(tmp); + usage_with_options_internal(usagestr, opts, 0, NULL); exit(129); } diff --git a/tools/perf/util/subcmd-util.h b/tools/perf/util/subcmd-util.h new file mode 100644 index 0000000..98fb9f9 --- /dev/null +++ b/tools/perf/util/subcmd-util.h @@ -0,0 +1,24 @@ +#ifndef __PERF_SUBCMD_UTIL_H +#define __PERF_SUBCMD_UTIL_H + +#include + +#define astrcatf(out, fmt, ...) \ +({ \ + char *tmp = *(out); \ + if (asprintf((out), "%s" fmt, tmp ?: "", ## __VA_ARGS__) == -1) \ + die("asprintf failed"); \ + free(tmp); \ +}) + +static inline void astrcat(char **out, const char *add) +{ + char *tmp = *out; + + if (asprintf(out, "%s%s", tmp ?: "", add) == -1) + die("asprintf failed"); + + free(tmp); +} + +#endif /* __PERF_SUBCMD_UTIL_H */ -- cgit v1.1 From 46113a54be53aea50a4f5926b87e86e2e66c4266 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:37 -0600 Subject: perf tools: Remove 'perf' from subcmd function and variable names In preparation for moving exec_cmd.c and run-command.c out of perf and into a library, remove 'perf' from all the symbol names. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/bc3ee82b40b8f396b644fa49e0f7260ce442635b.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-help.c | 2 +- tools/perf/builtin-script.c | 8 ++++---- tools/perf/perf.c | 6 +++--- tools/perf/tests/attr.c | 2 +- tools/perf/util/exec_cmd.c | 20 ++++++++++---------- tools/perf/util/exec_cmd.h | 12 ++++++------ tools/perf/util/help.c | 4 ++-- tools/perf/util/run-command.c | 6 +++--- tools/perf/util/run-command.h | 4 ++-- 9 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index a7d588b..275aa64 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -407,7 +407,7 @@ static int get_html_page_path(struct strbuf *page_path, const char *page) #ifndef open_html static void open_html(const char *path) { - execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL); + execl_cmd("web--browse", "-c", "help.browser", path, NULL); } #endif diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index d259e9a..571016f 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1408,7 +1408,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, char first_half[BUFSIZ]; char *script_root; - snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); + snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path()); scripts_dir = opendir(scripts_path); if (!scripts_dir) @@ -1529,7 +1529,7 @@ int find_scripts(char **scripts_array, char **scripts_path_array) if (!session) return -1; - snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); + snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path()); scripts_dir = opendir(scripts_path); if (!scripts_dir) { @@ -1587,7 +1587,7 @@ static char *get_script_path(const char *script_root, const char *suffix) char lang_path[MAXPATHLEN]; char *__script_root; - snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); + snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path()); scripts_dir = opendir(scripts_path); if (!scripts_dir) @@ -1823,7 +1823,7 @@ int cmd_script(int argc, 
const char **argv, const char *prefix __maybe_unused) scripting_max_stack = itrace_synth_opts.callchain_sz; /* make sure PERF_EXEC_PATH is set for scripts */ - perf_set_argv_exec_path(perf_exec_path()); + set_argv_exec_path(get_argv_exec_path()); if (argc && !script_name && !rec_script_path && !rep_script_path) { int live_pipe[2]; diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 783a331..6894325 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -183,9 +183,9 @@ static int handle_options(const char ***argv, int *argc, int *envchanged) if (!prefixcmp(cmd, CMD_EXEC_PATH)) { cmd += strlen(CMD_EXEC_PATH); if (*cmd == '=') - perf_set_argv_exec_path(cmd + 1); + set_argv_exec_path(cmd + 1); else { - puts(perf_exec_path()); + puts(get_argv_exec_path()); exit(0); } } else if (!strcmp(cmd, "--html-path")) { @@ -538,7 +538,7 @@ int main(int argc, const char **argv) page_size = sysconf(_SC_PAGE_SIZE); cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - cmd = perf_extract_argv0_path(argv[0]); + cmd = extract_argv0_path(argv[0]); if (!cmd) cmd = "perf-help"; diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index b66730eb9..6337f1c 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -164,7 +164,7 @@ int test__attr(int subtest __maybe_unused) return run_dir("./tests", "./perf"); /* Then installed path. */ - snprintf(path_dir, PATH_MAX, "%s/tests", perf_exec_path()); + snprintf(path_dir, PATH_MAX, "%s/tests", get_argv_exec_path()); snprintf(path_perf, PATH_MAX, "%s/perf", BINDIR); if (!lstat(path_dir, &st) && diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c index 65d86dc..701111a 100644 --- a/tools/perf/util/exec_cmd.c +++ b/tools/perf/util/exec_cmd.c @@ -32,7 +32,7 @@ char *system_path(const char *path) return buf; } -const char *perf_extract_argv0_path(const char *argv0) +const char *extract_argv0_path(const char *argv0) { const char *slash; @@ -51,7 +51,7 @@ const char *perf_extract_argv0_path(const char *argv0) return argv0; } -void perf_set_argv_exec_path(const char *exec_path) +void set_argv_exec_path(const char *exec_path) { argv_exec_path = exec_path; /* @@ -61,8 +61,8 @@ void perf_set_argv_exec_path(const char *exec_path) } -/* Returns the highest-priority, location to look for perf programs. */ -char *perf_exec_path(void) +/* Returns the highest-priority location to look for subprograms. */ +char *get_argv_exec_path(void) { char *env; @@ -92,7 +92,7 @@ void setup_path(void) { const char *old_path = getenv("PATH"); char *new_path = NULL; - char *tmp = perf_exec_path(); + char *tmp = get_argv_exec_path(); add_path(&new_path, tmp); add_path(&new_path, argv0_path); @@ -108,7 +108,7 @@ void setup_path(void) free(new_path); } -static const char **prepare_perf_cmd(const char **argv) +static const char **prepare_exec_cmd(const char **argv) { int argc; const char **nargv; @@ -124,8 +124,8 @@ static const char **prepare_perf_cmd(const char **argv) return nargv; } -int execv_perf_cmd(const char **argv) { - const char **nargv = prepare_perf_cmd(argv); +int execv_cmd(const char **argv) { + const char **nargv = prepare_exec_cmd(argv); /* execvp() can only ever return if it fails */ execvp(subcmd_config.exec_name, (char **)nargv); @@ -135,7 +135,7 @@ int execv_perf_cmd(const char **argv) { } -int execl_perf_cmd(const char *cmd,...) +int execl_cmd(const char *cmd,...) { int argc; const char *argv[MAX_ARGS + 1]; @@ -155,5 +155,5 @@ int execl_perf_cmd(const char *cmd,...) 
return error("too many args to run %s", cmd); argv[argc] = NULL; - return execv_perf_cmd(argv); + return execv_cmd(argv); } diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h index fd4434e..f1bd343 100644 --- a/tools/perf/util/exec_cmd.h +++ b/tools/perf/util/exec_cmd.h @@ -4,13 +4,13 @@ extern void exec_cmd_init(const char *exec_name, const char *prefix, const char *exec_path, const char *exec_path_env); -extern void perf_set_argv_exec_path(const char *exec_path); -extern const char *perf_extract_argv0_path(const char *path); +extern void set_argv_exec_path(const char *exec_path); +extern const char *extract_argv0_path(const char *path); extern void setup_path(void); -extern int execv_perf_cmd(const char **argv); /* NULL terminated */ -extern int execl_perf_cmd(const char *cmd, ...); -/* perf_exec_path and system_path return malloc'd string, caller must free it */ -extern char *perf_exec_path(void); +extern int execv_cmd(const char **argv); /* NULL terminated */ +extern int execl_cmd(const char *cmd, ...); +/* get_argv_exec_path and system_path return malloc'd string, caller must free it */ +extern char *get_argv_exec_path(void); extern char *system_path(const char *path); #endif /* __PERF_EXEC_CMD_H */ diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c index 8e5e0ce..303a347 100644 --- a/tools/perf/util/help.c +++ b/tools/perf/util/help.c @@ -155,7 +155,7 @@ void load_command_list(const char *prefix, struct cmdnames *other_cmds) { const char *env_path = getenv("PATH"); - char *exec_path = perf_exec_path(); + char *exec_path = get_argv_exec_path(); if (exec_path) { list_commands_in_dir(main_cmds, exec_path, prefix); @@ -200,7 +200,7 @@ void list_commands(const char *title, struct cmdnames *main_cmds, longest = other_cmds->names[i]->len; if (main_cmds->cnt) { - char *exec_path = perf_exec_path(); + char *exec_path = get_argv_exec_path(); printf("available %s in '%s'\n", title, exec_path); printf("----------------"); mput_char('-', strlen(title) + strlen(exec_path)); diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c index 34622b5..910c0f6 100644 --- a/tools/perf/util/run-command.c +++ b/tools/perf/util/run-command.c @@ -112,8 +112,8 @@ int start_command(struct child_process *cmd) } if (cmd->preexec_cb) cmd->preexec_cb(); - if (cmd->perf_cmd) { - execv_perf_cmd(cmd->argv); + if (cmd->exec_cmd) { + execv_cmd(cmd->argv); } else { execvp(cmd->argv[0], (char *const*) cmd->argv); } @@ -207,7 +207,7 @@ static void prepare_run_command_v_opt(struct child_process *cmd, memset(cmd, 0, sizeof(*cmd)); cmd->argv = argv; cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0; - cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0; + cmd->exec_cmd = opt & RUN_EXEC_CMD ? 1 : 0; cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 
1 : 0; } diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h index 1ef264d..cf7d655 100644 --- a/tools/perf/util/run-command.h +++ b/tools/perf/util/run-command.h @@ -41,7 +41,7 @@ struct child_process { unsigned no_stdin:1; unsigned no_stdout:1; unsigned no_stderr:1; - unsigned perf_cmd:1; /* if this is to be perf sub-command */ + unsigned exec_cmd:1; /* if this is to be external sub-command */ unsigned stdout_to_stderr:1; void (*preexec_cb)(void); }; @@ -51,7 +51,7 @@ int finish_command(struct child_process *); int run_command(struct child_process *); #define RUN_COMMAND_NO_STDIN 1 -#define RUN_PERF_CMD 2 /*If this is to be perf sub-command */ +#define RUN_EXEC_CMD 2 /*If this is to be external sub-command */ #define RUN_COMMAND_STDOUT_TO_STDERR 4 int run_command_v_opt(const char **argv, int opt); -- cgit v1.1 From 2f4ce5ec1d447beb42143a9653716a2ab025161e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:38 -0600 Subject: perf tools: Finalize subcmd independence For the files that will be moved to the subcmd library, remove all their perf-specific includes and duplicate any needed functionality. Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/6e12946f0f26ce4d543d34db68d9dae3c8551cb9.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/exec_cmd.c | 64 ++++++++++++++++++++++++++++++++---- tools/perf/util/help.c | 47 +++++++++++++++++++++++--- tools/perf/util/help.h | 4 ++- tools/perf/util/pager.c | 7 +++- tools/perf/util/parse-options.c | 73 +++++++++++++++++++++++++---------------- tools/perf/util/parse-options.h | 2 +- tools/perf/util/run-command.c | 16 ++++++--- tools/perf/util/run-command.h | 2 ++ tools/perf/util/sigchain.c | 3 +- tools/perf/util/subcmd-util.h | 67 +++++++++++++++++++++++++++++++++++++ tools/perf/util/util.h | 14 -------- 11 files changed, 237 insertions(+), 62 deletions(-) diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c index 701111a..e7f9ed79 100644 --- a/tools/perf/util/exec_cmd.c +++ b/tools/perf/util/exec_cmd.c @@ -1,12 +1,17 @@ -#include "cache.h" -#include "exec_cmd.h" -#include "quote.h" -#include "subcmd-config.h" - +#include +#include +#include +#include +#include #include +#include +#include #include "subcmd-util.h" +#include "exec_cmd.h" +#include "subcmd-config.h" #define MAX_ARGS 32 +#define PATH_MAX 4096 static const char *argv_exec_path; static const char *argv0_path; @@ -20,6 +25,49 @@ void exec_cmd_init(const char *exec_name, const char *prefix, subcmd_config.exec_path_env = exec_path_env; } +#define is_dir_sep(c) ((c) == '/') + +static int is_absolute_path(const char *path) +{ + return path[0] == '/'; +} + +static const char *get_pwd_cwd(void) +{ + static char cwd[PATH_MAX + 1]; + char *pwd; + struct stat cwd_stat, pwd_stat; + if (getcwd(cwd, PATH_MAX) == NULL) + return NULL; + pwd = getenv("PWD"); + if (pwd && strcmp(pwd, cwd)) { + stat(cwd, &cwd_stat); + if (!stat(pwd, &pwd_stat) && + pwd_stat.st_dev == cwd_stat.st_dev && + pwd_stat.st_ino == cwd_stat.st_ino) { + strlcpy(cwd, pwd, PATH_MAX); + } + } + return cwd; +} + +static const char *make_nonrelative_path(const char *path) +{ + static char buf[PATH_MAX + 1]; + + if (is_absolute_path(path)) { + if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) + die("Too long path: %.*s", 60, path); + } else { + const char *cwd = get_pwd_cwd(); + if (!cwd) + die("Cannot determine the current working directory"); + if (snprintf(buf, PATH_MAX, 
"%s/%s", cwd, path) >= PATH_MAX) + die("Too long path: %.*s", 60, path); + } + return buf; +} + char *system_path(const char *path) { char *buf = NULL; @@ -151,8 +199,10 @@ int execl_cmd(const char *cmd,...) break; } va_end(param); - if (MAX_ARGS <= argc) - return error("too many args to run %s", cmd); + if (MAX_ARGS <= argc) { + fprintf(stderr, " Error: too many args to run %s\n", cmd); + return -1; + } argv[argc] = NULL; return execv_cmd(argv); diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c index 303a347..8169480 100644 --- a/tools/perf/util/help.c +++ b/tools/perf/util/help.c @@ -1,8 +1,15 @@ -#include "cache.h" -#include "../builtin.h" -#include "exec_cmd.h" -#include "help.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "subcmd-util.h" +#include "help.h" +#include "exec_cmd.h" void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) { @@ -70,6 +77,28 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) cmds->cnt = cj; } +static void get_term_dimensions(struct winsize *ws) +{ + char *s = getenv("LINES"); + + if (s != NULL) { + ws->ws_row = atoi(s); + s = getenv("COLUMNS"); + if (s != NULL) { + ws->ws_col = atoi(s); + if (ws->ws_row && ws->ws_col) + return; + } + } +#ifdef TIOCGWINSZ + if (ioctl(1, TIOCGWINSZ, ws) == 0 && + ws->ws_row && ws->ws_col) + return; +#endif + ws->ws_row = 25; + ws->ws_col = 80; +} + static void pretty_print_string_list(struct cmdnames *cmds, int longest) { int cols = 1, rows; @@ -113,6 +142,14 @@ static int is_executable(const char *name) return st.st_mode & S_IXUSR; } +static int has_extension(const char *filename, const char *ext) +{ + size_t len = strlen(filename); + size_t extlen = strlen(ext); + + return len > extlen && !memcmp(filename + len - extlen, ext, extlen); +} + static void list_commands_in_dir(struct cmdnames *cmds, const char *path, const char *prefix) @@ -168,7 +205,7 @@ void load_command_list(const char *prefix, char *paths, *path, *colon; path = paths = strdup(env_path); while (1) { - if ((colon = strchr(path, PATH_SEP))) + if ((colon = strchr(path, ':'))) *colon = 0; if (!exec_path || strcmp(path, exec_path)) list_commands_in_dir(other_cmds, path, prefix); diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h index 14851b0..096c8bc 100644 --- a/tools/perf/util/help.h +++ b/tools/perf/util/help.h @@ -1,12 +1,14 @@ #ifndef __PERF_HELP_H #define __PERF_HELP_H +#include + struct cmdnames { size_t alloc; size_t cnt; struct cmdname { size_t len; /* also used for similarity index in help.c */ - char name[FLEX_ARRAY]; + char name[]; } **names; }; diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c index d5ef62e..d50f3b58 100644 --- a/tools/perf/util/pager.c +++ b/tools/perf/util/pager.c @@ -1,4 +1,9 @@ -#include "cache.h" +#include +#include +#include +#include +#include +#include "pager.h" #include "run-command.h" #include "sigchain.h" #include "subcmd-config.h" diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index c1da2a5..981bb44 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -1,10 +1,14 @@ -#include "util.h" +#include +#include +#include +#include +#include +#include +#include #include "subcmd-util.h" #include "parse-options.h" -#include "cache.h" -#include "header.h" #include "subcmd-config.h" -#include +#include "pager.h" #define OPT_SHORT 1 #define OPT_UNSET 2 @@ -14,20 +18,29 @@ char *error_buf; static int opterror(const struct option *opt, const char 
*reason, int flags) { if (flags & OPT_SHORT) - return error("switch `%c' %s", opt->short_name, reason); - if (flags & OPT_UNSET) - return error("option `no-%s' %s", opt->long_name, reason); - return error("option `%s' %s", opt->long_name, reason); + fprintf(stderr, " Error: switch `%c' %s", opt->short_name, reason); + else if (flags & OPT_UNSET) + fprintf(stderr, " Error: option `no-%s' %s", opt->long_name, reason); + else + fprintf(stderr, " Error: option `%s' %s", opt->long_name, reason); + + return -1; +} + +static const char *skip_prefix(const char *str, const char *prefix) +{ + size_t len = strlen(prefix); + return strncmp(str, prefix, len) ? NULL : str + len; } static void optwarning(const struct option *opt, const char *reason, int flags) { if (flags & OPT_SHORT) - warning("switch `%c' %s", opt->short_name, reason); + fprintf(stderr, " Warning: switch `%c' %s", opt->short_name, reason); else if (flags & OPT_UNSET) - warning("option `no-%s' %s", opt->long_name, reason); + fprintf(stderr, " Warning: option `no-%s' %s", opt->long_name, reason); else - warning("option `%s' %s", opt->long_name, reason); + fprintf(stderr, " Warning: option `%s' %s", opt->long_name, reason); } static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, @@ -71,11 +84,11 @@ static int get_value(struct parse_opt_ctx_t *p, if (((flags & OPT_SHORT) && p->excl_opt->short_name) || p->excl_opt->long_name == NULL) { - scnprintf(msg, sizeof(msg), "cannot be used with switch `%c'", - p->excl_opt->short_name); + snprintf(msg, sizeof(msg), "cannot be used with switch `%c'", + p->excl_opt->short_name); } else { - scnprintf(msg, sizeof(msg), "cannot be used with %s", - p->excl_opt->long_name); + snprintf(msg, sizeof(msg), "cannot be used with %s", + p->excl_opt->long_name); } opterror(opt, msg, flags); return -3; @@ -401,14 +414,16 @@ match: return get_value(p, options, flags); } - if (ambiguous_option) - return error("Ambiguous option: %s " - "(could be --%s%s or --%s%s)", - arg, - (ambiguous_flags & OPT_UNSET) ? "no-" : "", - ambiguous_option->long_name, - (abbrev_flags & OPT_UNSET) ? "no-" : "", - abbrev_option->long_name); + if (ambiguous_option) { + fprintf(stderr, + " Error: Ambiguous option: %s (could be --%s%s or --%s%s)", + arg, + (ambiguous_flags & OPT_UNSET) ? "no-" : "", + ambiguous_option->long_name, + (abbrev_flags & OPT_UNSET) ? 
"no-" : "", + abbrev_option->long_name); + return -1; + } if (abbrev_option) return get_value(p, abbrev_option, abbrev_flags); return -2; @@ -420,7 +435,7 @@ static void check_typos(const char *arg, const struct option *options) return; if (!prefixcmp(arg, "no-")) { - error ("did you mean `--%s` (with two dashes ?)", arg); + fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg); exit(129); } @@ -428,7 +443,7 @@ static void check_typos(const char *arg, const struct option *options) if (!options->long_name) continue; if (!prefixcmp(options->long_name, arg)) { - error ("did you mean `--%s` (with two dashes ?)", arg); + fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg); exit(129); } } @@ -746,16 +761,18 @@ static int option__cmp(const void *va, const void *vb) static struct option *options__order(const struct option *opts) { - int nr_opts = 0; + int nr_opts = 0, len; const struct option *o = opts; struct option *ordered; for (o = opts; o->type != OPTION_END; o++) ++nr_opts; - ordered = memdup(opts, sizeof(*o) * (nr_opts + 1)); - if (ordered == NULL) + len = sizeof(*o) * (nr_opts + 1); + ordered = malloc(len); + if (!ordered) goto out; + memcpy(ordered, opts, len); qsort(ordered, nr_opts, sizeof(*o), option__cmp); out: diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h index d1544069..dec893f 100644 --- a/tools/perf/util/parse-options.h +++ b/tools/perf/util/parse-options.h @@ -1,8 +1,8 @@ #ifndef __PERF_PARSE_OPTIONS_H #define __PERF_PARSE_OPTIONS_H -#include #include +#include enum parse_opt_type { /* special types */ diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c index 910c0f6..fed37d6 100644 --- a/tools/perf/util/run-command.c +++ b/tools/perf/util/run-command.c @@ -1,7 +1,15 @@ -#include "cache.h" +#include +#include +#include +#include +#include +#include +#include +#include "subcmd-util.h" #include "run-command.h" #include "exec_cmd.h" -#include "debug.h" + +#define STRERR_BUFSIZE 128 static inline void close_pair(int fd[2]) { @@ -164,8 +172,8 @@ static int wait_or_whine(pid_t pid) if (waiting < 0) { if (errno == EINTR) continue; - error("waitpid failed (%s)", - strerror_r(errno, sbuf, sizeof(sbuf))); + fprintf(stderr, " Error: waitpid failed (%s)", + strerror_r(errno, sbuf, sizeof(sbuf))); return -ERR_RUN_COMMAND_WAITPID; } if (waiting != pid) diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h index cf7d655..4a55393 100644 --- a/tools/perf/util/run-command.h +++ b/tools/perf/util/run-command.h @@ -1,6 +1,8 @@ #ifndef __PERF_RUN_COMMAND_H #define __PERF_RUN_COMMAND_H +#include + enum { ERR_RUN_COMMAND_FORK = 10000, ERR_RUN_COMMAND_EXEC, diff --git a/tools/perf/util/sigchain.c b/tools/perf/util/sigchain.c index ba785e9..3537c34 100644 --- a/tools/perf/util/sigchain.c +++ b/tools/perf/util/sigchain.c @@ -1,5 +1,6 @@ +#include +#include "subcmd-util.h" #include "sigchain.h" -#include "cache.h" #define SIGCHAIN_MAX_SIGNALS 32 diff --git a/tools/perf/util/subcmd-util.h b/tools/perf/util/subcmd-util.h index 98fb9f9..321aeb1 100644 --- a/tools/perf/util/subcmd-util.h +++ b/tools/perf/util/subcmd-util.h @@ -1,8 +1,66 @@ #ifndef __PERF_SUBCMD_UTIL_H #define __PERF_SUBCMD_UTIL_H +#include +#include #include +#define NORETURN __attribute__((__noreturn__)) + +static inline void report(const char *prefix, const char *err, va_list params) +{ + char msg[1024]; + vsnprintf(msg, sizeof(msg), err, params); + fprintf(stderr, " %s%s\n", prefix, msg); +} + +static NORETURN inline void 
die(const char *err, ...) +{ + va_list params; + + va_start(params, err); + report(" Fatal: ", err, params); + exit(128); + va_end(params); +} + +#define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) + +#define alloc_nr(x) (((x)+16)*3/2) + +/* + * Realloc the buffer pointed at by variable 'x' so that it can hold + * at least 'nr' entries; the number of entries currently allocated + * is 'alloc', using the standard growing factor alloc_nr() macro. + * + * DO NOT USE any expression with side-effect for 'x' or 'alloc'. + */ +#define ALLOC_GROW(x, nr, alloc) \ + do { \ + if ((nr) > alloc) { \ + if (alloc_nr(alloc) < (nr)) \ + alloc = (nr); \ + else \ + alloc = alloc_nr(alloc); \ + x = xrealloc((x), alloc * sizeof(*(x))); \ + } \ + } while(0) + +static inline void *xrealloc(void *ptr, size_t size) +{ + void *ret = realloc(ptr, size); + if (!ret && !size) + ret = realloc(ptr, 1); + if (!ret) { + ret = realloc(ptr, size); + if (!ret && !size) + ret = realloc(ptr, 1); + if (!ret) + die("Out of memory, realloc failed"); + } + return ret; +} + #define astrcatf(out, fmt, ...) \ ({ \ char *tmp = *(out); \ @@ -21,4 +79,13 @@ static inline void astrcat(char **out, const char *add) free(tmp); } +static inline int prefixcmp(const char *str, const char *prefix) +{ + for (; ; str++, prefix++) + if (!*prefix) + return 0; + else if (*str != *prefix) + return (unsigned char)*prefix - (unsigned char)*str; +} + #endif /* __PERF_SUBCMD_UTIL_H */ diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 150858f..4b519c5 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -151,12 +151,6 @@ extern void set_warning_routine(void (*routine)(const char *err, va_list params) extern int prefixcmp(const char *str, const char *prefix); extern void set_buildid_dir(const char *dir); -static inline const char *skip_prefix(const char *str, const char *prefix) -{ - size_t len = strlen(prefix); - return strncmp(str, prefix, len) ? NULL : str + len; -} - #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 1) #define HAVE_STRCHRNUL @@ -187,14 +181,6 @@ static inline void *zalloc(size_t size) #define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) -static inline int has_extension(const char *filename, const char *ext) -{ - size_t len = strlen(filename); - size_t extlen = strlen(ext); - - return len > extlen && !memcmp(filename + len - extlen, ext, extlen); -} - /* Sane ctype - no locale, and works with signed chars */ #undef isascii #undef isspace -- cgit v1.1 From 4b6ab94eabe4f55371cff4569750bb3996c55db6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:39 -0600 Subject: perf subcmd: Create subcmd library Move the subcommand-related files from perf to a new library named libsubcmd.a. Since we're moving files anyway, go ahead and rename 'exec_cmd.*' to 'exec-cmd.*' to be consistent with the naming of all the other files. 
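As a rough sketch of what the split buys us, an out-of-tree tool can now link against libsubcmd.a and reuse the option parsing, pager and exec-path handling. In the example below, everything that is not part of the library API is made up for illustration ("mytool", its install paths and its environment variable names); the functions called (exec_cmd_init, pager_init, parse_options, setup_pager) are the ones exported by the library as of this series:

	/*
	 * Hypothetical out-of-tree user of libsubcmd. The tool name,
	 * prefix/exec paths and MYTOOL_* environment variable names are
	 * illustrative only; the calls are the library API created above.
	 */
	#include <stdio.h>
	#include <subcmd/exec-cmd.h>
	#include <subcmd/pager.h>
	#include <subcmd/parse-options.h>

	static int verbose;

	int main(int argc, const char **argv)
	{
		const char * const usage_str[] = {
			"mytool [<options>] <arg>...",
			NULL
		};
		struct option options[] = {
			OPT_INCR('v', "verbose", &verbose, "be more verbose"),
			OPT_END()
		};

		/*
		 * Runtime configuration, replacing perf's old compile-time
		 * PREFIX/PERF_EXEC_PATH defines:
		 */
		exec_cmd_init("mytool", "/usr/local",
			      "libexec/mytool-core", "MYTOOL_EXEC_PATH");
		pager_init("MYTOOL_PAGER");

		/*
		 * NOTE: may call exit() on error or for 'special' options,
		 * as documented earlier in this series.
		 */
		argc = parse_options(argc, argv, options, usage_str, 0);

		setup_pager();	/* page long output on a tty, like perf does */
		printf("verbose=%d, %d argument(s) left\n", verbose, argc);
		return 0;
	}

Something along the lines of 'cc -I tools/lib mytool.c tools/lib/subcmd/libsubcmd.a -o mytool' should be enough to build it, though the exact -I and library paths depend on where and how the tree was built.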
Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/c0a838d4c878ab17fee50998811612b2281355c1.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/Build | 7 + tools/lib/subcmd/Makefile | 48 ++ tools/lib/subcmd/exec-cmd.c | 209 +++++++ tools/lib/subcmd/exec-cmd.h | 16 + tools/lib/subcmd/help.c | 268 +++++++++ tools/lib/subcmd/help.h | 34 ++ tools/lib/subcmd/pager.c | 100 ++++ tools/lib/subcmd/pager.h | 9 + tools/lib/subcmd/parse-options.c | 983 +++++++++++++++++++++++++++++++++ tools/lib/subcmd/parse-options.h | 228 ++++++++ tools/lib/subcmd/run-command.c | 227 ++++++++ tools/lib/subcmd/run-command.h | 60 ++ tools/lib/subcmd/sigchain.c | 53 ++ tools/lib/subcmd/sigchain.h | 10 + tools/lib/subcmd/subcmd-config.c | 11 + tools/lib/subcmd/subcmd-config.h | 14 + tools/lib/subcmd/subcmd-util.h | 91 +++ tools/perf/MANIFEST | 1 + tools/perf/Makefile.perf | 20 +- tools/perf/arch/x86/util/intel-pt.c | 2 +- tools/perf/bench/futex-hash.c | 2 +- tools/perf/bench/futex-lock-pi.c | 2 +- tools/perf/bench/futex-requeue.c | 2 +- tools/perf/bench/futex-wake-parallel.c | 2 +- tools/perf/bench/futex-wake.c | 2 +- tools/perf/bench/mem-functions.c | 2 +- tools/perf/bench/numa.c | 2 +- tools/perf/bench/sched-messaging.c | 2 +- tools/perf/bench/sched-pipe.c | 2 +- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-bench.c | 2 +- tools/perf/builtin-buildid-cache.c | 2 +- tools/perf/builtin-buildid-list.c | 2 +- tools/perf/builtin-config.c | 2 +- tools/perf/builtin-data.c | 2 +- tools/perf/builtin-evlist.c | 2 +- tools/perf/builtin-help.c | 8 +- tools/perf/builtin-inject.c | 2 +- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-kvm.c | 2 +- tools/perf/builtin-list.c | 2 +- tools/perf/builtin-lock.c | 2 +- tools/perf/builtin-mem.c | 2 +- tools/perf/builtin-probe.c | 2 +- tools/perf/builtin-record.c | 2 +- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-sched.c | 2 +- tools/perf/builtin-script.c | 4 +- tools/perf/builtin-stat.c | 2 +- tools/perf/builtin-timechart.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/builtin-trace.c | 4 +- tools/perf/perf.c | 6 +- tools/perf/tests/attr.c | 2 +- tools/perf/tests/builtin-test.c | 2 +- tools/perf/util/Build | 7 - tools/perf/util/auxtrace.c | 2 +- tools/perf/util/cache.h | 2 +- tools/perf/util/cgroup.c | 2 +- tools/perf/util/config.c | 2 +- tools/perf/util/evlist.c | 2 +- tools/perf/util/exec_cmd.c | 209 ------- tools/perf/util/exec_cmd.h | 16 - tools/perf/util/help-unknown-cmd.c | 2 +- tools/perf/util/help.c | 268 --------- tools/perf/util/help.h | 34 -- tools/perf/util/pager.c | 100 ---- tools/perf/util/pager.h | 9 - tools/perf/util/parse-branch-options.c | 2 +- tools/perf/util/parse-events.c | 4 +- tools/perf/util/parse-options.c | 983 --------------------------------- tools/perf/util/parse-options.h | 228 -------- tools/perf/util/parse-regs-options.c | 2 +- tools/perf/util/run-command.c | 227 -------- tools/perf/util/run-command.h | 60 -- tools/perf/util/sigchain.c | 53 -- tools/perf/util/sigchain.h | 10 - tools/perf/util/sort.h | 2 +- tools/perf/util/subcmd-config.c | 11 - tools/perf/util/subcmd-config.h | 14 - tools/perf/util/subcmd-util.h | 91 --- 81 files changed, 2439 insertions(+), 2378 deletions(-) create mode 100644 tools/lib/subcmd/Build create mode 100644 tools/lib/subcmd/Makefile create mode 100644 tools/lib/subcmd/exec-cmd.c create mode 100644 tools/lib/subcmd/exec-cmd.h create mode 100644 tools/lib/subcmd/help.c create mode 100644 
tools/lib/subcmd/help.h create mode 100644 tools/lib/subcmd/pager.c create mode 100644 tools/lib/subcmd/pager.h create mode 100644 tools/lib/subcmd/parse-options.c create mode 100644 tools/lib/subcmd/parse-options.h create mode 100644 tools/lib/subcmd/run-command.c create mode 100644 tools/lib/subcmd/run-command.h create mode 100644 tools/lib/subcmd/sigchain.c create mode 100644 tools/lib/subcmd/sigchain.h create mode 100644 tools/lib/subcmd/subcmd-config.c create mode 100644 tools/lib/subcmd/subcmd-config.h create mode 100644 tools/lib/subcmd/subcmd-util.h delete mode 100644 tools/perf/util/exec_cmd.c delete mode 100644 tools/perf/util/exec_cmd.h delete mode 100644 tools/perf/util/help.c delete mode 100644 tools/perf/util/help.h delete mode 100644 tools/perf/util/pager.c delete mode 100644 tools/perf/util/pager.h delete mode 100644 tools/perf/util/parse-options.c delete mode 100644 tools/perf/util/parse-options.h delete mode 100644 tools/perf/util/run-command.c delete mode 100644 tools/perf/util/run-command.h delete mode 100644 tools/perf/util/sigchain.c delete mode 100644 tools/perf/util/sigchain.h delete mode 100644 tools/perf/util/subcmd-config.c delete mode 100644 tools/perf/util/subcmd-config.h delete mode 100644 tools/perf/util/subcmd-util.h diff --git a/tools/lib/subcmd/Build b/tools/lib/subcmd/Build new file mode 100644 index 0000000..ee31288 --- /dev/null +++ b/tools/lib/subcmd/Build @@ -0,0 +1,7 @@ +libsubcmd-y += exec-cmd.o +libsubcmd-y += help.o +libsubcmd-y += pager.o +libsubcmd-y += parse-options.o +libsubcmd-y += run-command.o +libsubcmd-y += sigchain.o +libsubcmd-y += subcmd-config.o diff --git a/tools/lib/subcmd/Makefile b/tools/lib/subcmd/Makefile new file mode 100644 index 0000000..629cf8c --- /dev/null +++ b/tools/lib/subcmd/Makefile @@ -0,0 +1,48 @@ +include ../../scripts/Makefile.include +include ../../perf/config/utilities.mak # QUIET_CLEAN + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(shell pwd))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +#$(info Determined 'srctree' to be $(srctree)) +endif + +CC = $(CROSS_COMPILE)gcc +AR = $(CROSS_COMPILE)ar +RM = rm -f + +MAKEFLAGS += --no-print-directory + +LIBFILE = $(OUTPUT)libsubcmd.a + +CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) +CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC +CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE + +CFLAGS += -I$(srctree)/tools/include/ +CFLAGS += -I$(srctree)/include/uapi +CFLAGS += -I$(srctree)/include + +SUBCMD_IN := $(OUTPUT)libsubcmd-in.o + +all: + +export srctree OUTPUT CC LD CFLAGS V +include $(srctree)/tools/build/Makefile.include + +all: fixdep $(LIBFILE) + +$(SUBCMD_IN): FORCE + @$(MAKE) $(build)=libsubcmd + +$(LIBFILE): $(SUBCMD_IN) + $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(SUBCMD_IN) + +clean: + $(call QUIET_CLEAN, libsubcmd) $(RM) $(LIBFILE); \ + find $(if $(OUTPUT),$(OUTPUT),.) 
-name \*.o -or -name \*.o.cmd -or -name \*.o.d | xargs $(RM) + +FORCE: + +.PHONY: clean FORCE diff --git a/tools/lib/subcmd/exec-cmd.c b/tools/lib/subcmd/exec-cmd.c new file mode 100644 index 0000000..1ae833a --- /dev/null +++ b/tools/lib/subcmd/exec-cmd.c @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "subcmd-util.h" +#include "exec-cmd.h" +#include "subcmd-config.h" + +#define MAX_ARGS 32 +#define PATH_MAX 4096 + +static const char *argv_exec_path; +static const char *argv0_path; + +void exec_cmd_init(const char *exec_name, const char *prefix, + const char *exec_path, const char *exec_path_env) +{ + subcmd_config.exec_name = exec_name; + subcmd_config.prefix = prefix; + subcmd_config.exec_path = exec_path; + subcmd_config.exec_path_env = exec_path_env; +} + +#define is_dir_sep(c) ((c) == '/') + +static int is_absolute_path(const char *path) +{ + return path[0] == '/'; +} + +static const char *get_pwd_cwd(void) +{ + static char cwd[PATH_MAX + 1]; + char *pwd; + struct stat cwd_stat, pwd_stat; + if (getcwd(cwd, PATH_MAX) == NULL) + return NULL; + pwd = getenv("PWD"); + if (pwd && strcmp(pwd, cwd)) { + stat(cwd, &cwd_stat); + if (!stat(pwd, &pwd_stat) && + pwd_stat.st_dev == cwd_stat.st_dev && + pwd_stat.st_ino == cwd_stat.st_ino) { + strlcpy(cwd, pwd, PATH_MAX); + } + } + return cwd; +} + +static const char *make_nonrelative_path(const char *path) +{ + static char buf[PATH_MAX + 1]; + + if (is_absolute_path(path)) { + if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) + die("Too long path: %.*s", 60, path); + } else { + const char *cwd = get_pwd_cwd(); + if (!cwd) + die("Cannot determine the current working directory"); + if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX) + die("Too long path: %.*s", 60, path); + } + return buf; +} + +char *system_path(const char *path) +{ + char *buf = NULL; + + if (is_absolute_path(path)) + return strdup(path); + + astrcatf(&buf, "%s/%s", subcmd_config.prefix, path); + + return buf; +} + +const char *extract_argv0_path(const char *argv0) +{ + const char *slash; + + if (!argv0 || !*argv0) + return NULL; + slash = argv0 + strlen(argv0); + + while (argv0 <= slash && !is_dir_sep(*slash)) + slash--; + + if (slash >= argv0) { + argv0_path = strndup(argv0, slash - argv0); + return argv0_path ? slash + 1 : NULL; + } + + return argv0; +} + +void set_argv_exec_path(const char *exec_path) +{ + argv_exec_path = exec_path; + /* + * Propagate this setting to external programs. + */ + setenv(subcmd_config.exec_path_env, exec_path, 1); +} + + +/* Returns the highest-priority location to look for subprograms. 
*/ +char *get_argv_exec_path(void) +{ + char *env; + + if (argv_exec_path) + return strdup(argv_exec_path); + + env = getenv(subcmd_config.exec_path_env); + if (env && *env) + return strdup(env); + + return system_path(subcmd_config.exec_path); +} + +static void add_path(char **out, const char *path) +{ + if (path && *path) { + if (is_absolute_path(path)) + astrcat(out, path); + else + astrcat(out, make_nonrelative_path(path)); + + astrcat(out, ":"); + } +} + +void setup_path(void) +{ + const char *old_path = getenv("PATH"); + char *new_path = NULL; + char *tmp = get_argv_exec_path(); + + add_path(&new_path, tmp); + add_path(&new_path, argv0_path); + free(tmp); + + if (old_path) + astrcat(&new_path, old_path); + else + astrcat(&new_path, "/usr/local/bin:/usr/bin:/bin"); + + setenv("PATH", new_path, 1); + + free(new_path); +} + +static const char **prepare_exec_cmd(const char **argv) +{ + int argc; + const char **nargv; + + for (argc = 0; argv[argc]; argc++) + ; /* just counting */ + nargv = malloc(sizeof(*nargv) * (argc + 2)); + + nargv[0] = subcmd_config.exec_name; + for (argc = 0; argv[argc]; argc++) + nargv[argc + 1] = argv[argc]; + nargv[argc + 1] = NULL; + return nargv; +} + +int execv_cmd(const char **argv) { + const char **nargv = prepare_exec_cmd(argv); + + /* execvp() can only ever return if it fails */ + execvp(subcmd_config.exec_name, (char **)nargv); + + free(nargv); + return -1; +} + + +int execl_cmd(const char *cmd,...) +{ + int argc; + const char *argv[MAX_ARGS + 1]; + const char *arg; + va_list param; + + va_start(param, cmd); + argv[0] = cmd; + argc = 1; + while (argc < MAX_ARGS) { + arg = argv[argc++] = va_arg(param, char *); + if (!arg) + break; + } + va_end(param); + if (MAX_ARGS <= argc) { + fprintf(stderr, " Error: too many args to run %s\n", cmd); + return -1; + } + + argv[argc] = NULL; + return execv_cmd(argv); +} diff --git a/tools/lib/subcmd/exec-cmd.h b/tools/lib/subcmd/exec-cmd.h new file mode 100644 index 0000000..f1bd343 --- /dev/null +++ b/tools/lib/subcmd/exec-cmd.h @@ -0,0 +1,16 @@ +#ifndef __PERF_EXEC_CMD_H +#define __PERF_EXEC_CMD_H + +extern void exec_cmd_init(const char *exec_name, const char *prefix, + const char *exec_path, const char *exec_path_env); + +extern void set_argv_exec_path(const char *exec_path); +extern const char *extract_argv0_path(const char *path); +extern void setup_path(void); +extern int execv_cmd(const char **argv); /* NULL terminated */ +extern int execl_cmd(const char *cmd, ...); +/* get_argv_exec_path and system_path return malloc'd string, caller must free it */ +extern char *get_argv_exec_path(void); +extern char *system_path(const char *path); + +#endif /* __PERF_EXEC_CMD_H */ diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c new file mode 100644 index 0000000..e228c3c --- /dev/null +++ b/tools/lib/subcmd/help.c @@ -0,0 +1,268 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "subcmd-util.h" +#include "help.h" +#include "exec-cmd.h" + +void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) +{ + struct cmdname *ent = malloc(sizeof(*ent) + len + 1); + + ent->len = len; + memcpy(ent->name, name, len); + ent->name[len] = 0; + + ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc); + cmds->names[cmds->cnt++] = ent; +} + +void clean_cmdnames(struct cmdnames *cmds) +{ + unsigned int i; + + for (i = 0; i < cmds->cnt; ++i) + zfree(&cmds->names[i]); + zfree(&cmds->names); + cmds->cnt = 0; + cmds->alloc = 0; +} + +int cmdname_compare(const void *a_, const 
void *b_) +{ + struct cmdname *a = *(struct cmdname **)a_; + struct cmdname *b = *(struct cmdname **)b_; + return strcmp(a->name, b->name); +} + +void uniq(struct cmdnames *cmds) +{ + unsigned int i, j; + + if (!cmds->cnt) + return; + + for (i = j = 1; i < cmds->cnt; i++) + if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name)) + cmds->names[j++] = cmds->names[i]; + + cmds->cnt = j; +} + +void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) +{ + size_t ci, cj, ei; + int cmp; + + ci = cj = ei = 0; + while (ci < cmds->cnt && ei < excludes->cnt) { + cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); + if (cmp < 0) + cmds->names[cj++] = cmds->names[ci++]; + else if (cmp == 0) + ci++, ei++; + else if (cmp > 0) + ei++; + } + + while (ci < cmds->cnt) + cmds->names[cj++] = cmds->names[ci++]; + + cmds->cnt = cj; +} + +static void get_term_dimensions(struct winsize *ws) +{ + char *s = getenv("LINES"); + + if (s != NULL) { + ws->ws_row = atoi(s); + s = getenv("COLUMNS"); + if (s != NULL) { + ws->ws_col = atoi(s); + if (ws->ws_row && ws->ws_col) + return; + } + } +#ifdef TIOCGWINSZ + if (ioctl(1, TIOCGWINSZ, ws) == 0 && + ws->ws_row && ws->ws_col) + return; +#endif + ws->ws_row = 25; + ws->ws_col = 80; +} + +static void pretty_print_string_list(struct cmdnames *cmds, int longest) +{ + int cols = 1, rows; + int space = longest + 1; /* min 1 SP between words */ + struct winsize win; + int max_cols; + int i, j; + + get_term_dimensions(&win); + max_cols = win.ws_col - 1; /* don't print *on* the edge */ + + if (space < max_cols) + cols = max_cols / space; + rows = (cmds->cnt + cols - 1) / cols; + + for (i = 0; i < rows; i++) { + printf(" "); + + for (j = 0; j < cols; j++) { + unsigned int n = j * rows + i; + unsigned int size = space; + + if (n >= cmds->cnt) + break; + if (j == cols-1 || n + rows >= cmds->cnt) + size = 1; + printf("%-*s", size, cmds->names[n]->name); + } + putchar('\n'); + } +} + +static int is_executable(const char *name) +{ + struct stat st; + + if (stat(name, &st) || /* stat, not lstat */ + !S_ISREG(st.st_mode)) + return 0; + + return st.st_mode & S_IXUSR; +} + +static int has_extension(const char *filename, const char *ext) +{ + size_t len = strlen(filename); + size_t extlen = strlen(ext); + + return len > extlen && !memcmp(filename + len - extlen, ext, extlen); +} + +static void list_commands_in_dir(struct cmdnames *cmds, + const char *path, + const char *prefix) +{ + int prefix_len; + DIR *dir = opendir(path); + struct dirent *de; + char *buf = NULL; + + if (!dir) + return; + if (!prefix) + prefix = "perf-"; + prefix_len = strlen(prefix); + + astrcatf(&buf, "%s/", path); + + while ((de = readdir(dir)) != NULL) { + int entlen; + + if (prefixcmp(de->d_name, prefix)) + continue; + + astrcat(&buf, de->d_name); + if (!is_executable(buf)) + continue; + + entlen = strlen(de->d_name) - prefix_len; + if (has_extension(de->d_name, ".exe")) + entlen -= 4; + + add_cmdname(cmds, de->d_name + prefix_len, entlen); + } + closedir(dir); + free(buf); +} + +void load_command_list(const char *prefix, + struct cmdnames *main_cmds, + struct cmdnames *other_cmds) +{ + const char *env_path = getenv("PATH"); + char *exec_path = get_argv_exec_path(); + + if (exec_path) { + list_commands_in_dir(main_cmds, exec_path, prefix); + qsort(main_cmds->names, main_cmds->cnt, + sizeof(*main_cmds->names), cmdname_compare); + uniq(main_cmds); + } + + if (env_path) { + char *paths, *path, *colon; + path = paths = strdup(env_path); + while (1) { + if ((colon = strchr(path, ':'))) + *colon = 0; 
+			if (!exec_path || strcmp(path, exec_path))
+				list_commands_in_dir(other_cmds, path, prefix);
+
+			if (!colon)
+				break;
+			path = colon + 1;
+		}
+		free(paths);
+
+		qsort(other_cmds->names, other_cmds->cnt,
+		      sizeof(*other_cmds->names), cmdname_compare);
+		uniq(other_cmds);
+	}
+	free(exec_path);
+	exclude_cmds(other_cmds, main_cmds);
+}
+
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds)
+{
+	unsigned int i, longest = 0;
+
+	for (i = 0; i < main_cmds->cnt; i++)
+		if (longest < main_cmds->names[i]->len)
+			longest = main_cmds->names[i]->len;
+	for (i = 0; i < other_cmds->cnt; i++)
+		if (longest < other_cmds->names[i]->len)
+			longest = other_cmds->names[i]->len;
+
+	if (main_cmds->cnt) {
+		char *exec_path = get_argv_exec_path();
+		printf("available %s in '%s'\n", title, exec_path);
+		printf("----------------");
+		mput_char('-', strlen(title) + strlen(exec_path));
+		putchar('\n');
+		pretty_print_string_list(main_cmds, longest);
+		putchar('\n');
+		free(exec_path);
+	}
+
+	if (other_cmds->cnt) {
+		printf("%s available from elsewhere on your $PATH\n", title);
+		printf("---------------------------------------");
+		mput_char('-', strlen(title));
+		putchar('\n');
+		pretty_print_string_list(other_cmds, longest);
+		putchar('\n');
+	}
+}
+
+int is_in_cmdlist(struct cmdnames *c, const char *s)
+{
+	unsigned int i;
+
+	for (i = 0; i < c->cnt; i++)
+		if (!strcmp(s, c->names[i]->name))
+			return 1;
+	return 0;
+}
diff --git a/tools/lib/subcmd/help.h b/tools/lib/subcmd/help.h
new file mode 100644
index 0000000..096c8bc
--- /dev/null
+++ b/tools/lib/subcmd/help.h
@@ -0,0 +1,34 @@
+#ifndef __PERF_HELP_H
+#define __PERF_HELP_H
+
+#include <stdio.h>
+
+struct cmdnames {
+	size_t alloc;
+	size_t cnt;
+	struct cmdname {
+		size_t len; /* also used for similarity index in help.c */
+		char name[];
+	} **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+	while(num--)
+		putchar(c);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, size_t len);
+void clean_cmdnames(struct cmdnames *cmds);
+int cmdname_compare(const void *a, const void *b);
+void uniq(struct cmdnames *cmds);
+/* Here we require that excludes is a sorted list. */
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds);
+
+#endif /* __PERF_HELP_H */
diff --git a/tools/lib/subcmd/pager.c b/tools/lib/subcmd/pager.c
new file mode 100644
index 0000000..d50f3b58
--- /dev/null
+++ b/tools/lib/subcmd/pager.c
@@ -0,0 +1,100 @@
+#include <sys/select.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "pager.h"
+#include "run-command.h"
+#include "sigchain.h"
+#include "subcmd-config.h"
+
+/*
+ * This is split up from the rest of git so that we can do
+ * something different on Windows.
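+ * (This standalone copy only carries the POSIX behaviour.)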
+ */
+
+static int spawned_pager;
+
+void pager_init(const char *pager_env)
+{
+	subcmd_config.pager_env = pager_env;
+}
+
+static void pager_preexec(void)
+{
+	/*
+	 * Work around bug in "less" by not starting it until we
+	 * have real input
+	 */
+	fd_set in;
+
+	FD_ZERO(&in);
+	FD_SET(0, &in);
+	select(1, &in, NULL, &in, NULL);
+
+	setenv("LESS", "FRSX", 0);
+}
+
+static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
+static struct child_process pager_process;
+
+static void wait_for_pager(void)
+{
+	fflush(stdout);
+	fflush(stderr);
+	/* signal EOF to pager */
+	close(1);
+	close(2);
+	finish_command(&pager_process);
+}
+
+static void wait_for_pager_signal(int signo)
+{
+	wait_for_pager();
+	sigchain_pop(signo);
+	raise(signo);
+}
+
+void setup_pager(void)
+{
+	const char *pager = getenv(subcmd_config.pager_env);
+
+	if (!isatty(1))
+		return;
+	if (!pager)
+		pager = getenv("PAGER");
+	if (!(pager || access("/usr/bin/pager", X_OK)))
+		pager = "/usr/bin/pager";
+	if (!(pager || access("/usr/bin/less", X_OK)))
+		pager = "/usr/bin/less";
+	if (!pager)
+		pager = "cat";
+	if (!*pager || !strcmp(pager, "cat"))
+		return;
+
+	spawned_pager = 1; /* means we are emitting to terminal */
+
+	/* spawn the pager */
+	pager_argv[2] = pager;
+	pager_process.argv = pager_argv;
+	pager_process.in = -1;
+	pager_process.preexec_cb = pager_preexec;
+
+	if (start_command(&pager_process))
+		return;
+
+	/* original process continues, but writes to the pipe */
+	dup2(pager_process.in, 1);
+	if (isatty(2))
+		dup2(pager_process.in, 2);
+	close(pager_process.in);
+
+	/* this makes sure that the parent terminates after the pager */
+	sigchain_push_common(wait_for_pager_signal);
+	atexit(wait_for_pager);
+}
+
+int pager_in_use(void)
+{
+	return spawned_pager;
+}
diff --git a/tools/lib/subcmd/pager.h b/tools/lib/subcmd/pager.h
new file mode 100644
index 0000000..d6a591a
--- /dev/null
+++ b/tools/lib/subcmd/pager.h
@@ -0,0 +1,9 @@
+#ifndef __PERF_PAGER_H
+#define __PERF_PAGER_H
+
+extern void pager_init(const char *pager_env);
+
+extern void setup_pager(void);
+extern int pager_in_use(void);
+
+#endif /* __PERF_PAGER_H */
diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c
new file mode 100644
index 0000000..981bb44
--- /dev/null
+++ b/tools/lib/subcmd/parse-options.c
@@ -0,0 +1,983 @@
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "subcmd-util.h"
+#include "parse-options.h"
+#include "subcmd-config.h"
+#include "pager.h"
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+char *error_buf;
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		fprintf(stderr, " Error: switch `%c' %s", opt->short_name, reason);
+	else if (flags & OPT_UNSET)
+		fprintf(stderr, " Error: option `no-%s' %s", opt->long_name, reason);
+	else
+		fprintf(stderr, " Error: option `%s' %s", opt->long_name, reason);
+
+	return -1;
+}
+
+static const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ?
NULL : str + len; +} + +static void optwarning(const struct option *opt, const char *reason, int flags) +{ + if (flags & OPT_SHORT) + fprintf(stderr, " Warning: switch `%c' %s", opt->short_name, reason); + else if (flags & OPT_UNSET) + fprintf(stderr, " Warning: option `no-%s' %s", opt->long_name, reason); + else + fprintf(stderr, " Warning: option `%s' %s", opt->long_name, reason); +} + +static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, + int flags, const char **arg) +{ + const char *res; + + if (p->opt) { + res = p->opt; + p->opt = NULL; + } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 || + **(p->argv + 1) == '-')) { + res = (const char *)opt->defval; + } else if (p->argc > 1) { + p->argc--; + res = *++p->argv; + } else + return opterror(opt, "requires a value", flags); + if (arg) + *arg = res; + return 0; +} + +static int get_value(struct parse_opt_ctx_t *p, + const struct option *opt, int flags) +{ + const char *s, *arg = NULL; + const int unset = flags & OPT_UNSET; + int err; + + if (unset && p->opt) + return opterror(opt, "takes no value", flags); + if (unset && (opt->flags & PARSE_OPT_NONEG)) + return opterror(opt, "isn't available", flags); + if (opt->flags & PARSE_OPT_DISABLED) + return opterror(opt, "is not usable", flags); + + if (opt->flags & PARSE_OPT_EXCLUSIVE) { + if (p->excl_opt && p->excl_opt != opt) { + char msg[128]; + + if (((flags & OPT_SHORT) && p->excl_opt->short_name) || + p->excl_opt->long_name == NULL) { + snprintf(msg, sizeof(msg), "cannot be used with switch `%c'", + p->excl_opt->short_name); + } else { + snprintf(msg, sizeof(msg), "cannot be used with %s", + p->excl_opt->long_name); + } + opterror(opt, msg, flags); + return -3; + } + p->excl_opt = opt; + } + if (!(flags & OPT_SHORT) && p->opt) { + switch (opt->type) { + case OPTION_CALLBACK: + if (!(opt->flags & PARSE_OPT_NOARG)) + break; + /* FALLTHROUGH */ + case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_BIT: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + return opterror(opt, "takes no value", flags); + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + case OPTION_STRING: + case OPTION_INTEGER: + case OPTION_UINTEGER: + case OPTION_LONG: + case OPTION_U64: + default: + break; + } + } + + if (opt->flags & PARSE_OPT_NOBUILD) { + char reason[128]; + bool noarg = false; + + err = snprintf(reason, sizeof(reason), + opt->flags & PARSE_OPT_CANSKIP ? + "is being ignored because %s " : + "is not available because %s", + opt->build_opt); + reason[sizeof(reason) - 1] = '\0'; + + if (err < 0) + strncpy(reason, opt->flags & PARSE_OPT_CANSKIP ? 
+ "is being ignored" : + "is not available", + sizeof(reason)); + + if (!(opt->flags & PARSE_OPT_CANSKIP)) + return opterror(opt, reason, flags); + + err = 0; + if (unset) + noarg = true; + if (opt->flags & PARSE_OPT_NOARG) + noarg = true; + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + noarg = true; + + switch (opt->type) { + case OPTION_BOOLEAN: + case OPTION_INCR: + case OPTION_BIT: + case OPTION_SET_UINT: + case OPTION_SET_PTR: + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + noarg = true; + break; + case OPTION_CALLBACK: + case OPTION_STRING: + case OPTION_INTEGER: + case OPTION_UINTEGER: + case OPTION_LONG: + case OPTION_U64: + default: + break; + } + + if (!noarg) + err = get_arg(p, opt, flags, NULL); + if (err) + return err; + + optwarning(opt, reason, flags); + return 0; + } + + switch (opt->type) { + case OPTION_BIT: + if (unset) + *(int *)opt->value &= ~opt->defval; + else + *(int *)opt->value |= opt->defval; + return 0; + + case OPTION_BOOLEAN: + *(bool *)opt->value = unset ? false : true; + if (opt->set) + *(bool *)opt->set = true; + return 0; + + case OPTION_INCR: + *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1; + return 0; + + case OPTION_SET_UINT: + *(unsigned int *)opt->value = unset ? 0 : opt->defval; + return 0; + + case OPTION_SET_PTR: + *(void **)opt->value = unset ? NULL : (void *)opt->defval; + return 0; + + case OPTION_STRING: + err = 0; + if (unset) + *(const char **)opt->value = NULL; + else if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + *(const char **)opt->value = (const char *)opt->defval; + else + err = get_arg(p, opt, flags, (const char **)opt->value); + + /* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */ + if (opt->flags & PARSE_OPT_NOEMPTY) { + const char *val = *(const char **)opt->value; + + if (!val) + return err; + + /* Similar to unset if we are given an empty string. */ + if (val[0] == '\0') { + *(const char **)opt->value = NULL; + return 0; + } + } + + return err; + + case OPTION_CALLBACK: + if (unset) + return (*opt->callback)(opt, NULL, 1) ? (-1) : 0; + if (opt->flags & PARSE_OPT_NOARG) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (get_arg(p, opt, flags, &arg)) + return -1; + return (*opt->callback)(opt, arg, 0) ? 
(-1) : 0; + + case OPTION_INTEGER: + if (unset) { + *(int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(int *)opt->value = strtol(arg, (char **)&s, 10); + if (*s) + return opterror(opt, "expects a numerical value", flags); + return 0; + + case OPTION_UINTEGER: + if (unset) { + *(unsigned int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(unsigned int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(unsigned int *)opt->value = strtol(arg, (char **)&s, 10); + if (*s) + return opterror(opt, "expects a numerical value", flags); + return 0; + + case OPTION_LONG: + if (unset) { + *(long *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(long *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(long *)opt->value = strtol(arg, (char **)&s, 10); + if (*s) + return opterror(opt, "expects a numerical value", flags); + return 0; + + case OPTION_U64: + if (unset) { + *(u64 *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(u64 *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(u64 *)opt->value = strtoull(arg, (char **)&s, 10); + if (*s) + return opterror(opt, "expects a numerical value", flags); + return 0; + + case OPTION_END: + case OPTION_ARGUMENT: + case OPTION_GROUP: + default: + die("should not happen, someone must be hit on the forehead"); + } +} + +static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options) +{ + for (; options->type != OPTION_END; options++) { + if (options->short_name == *p->opt) { + p->opt = p->opt[1] ? p->opt + 1 : NULL; + return get_value(p, options, OPT_SHORT); + } + } + return -2; +} + +static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, + const struct option *options) +{ + const char *arg_end = strchr(arg, '='); + const struct option *abbrev_option = NULL, *ambiguous_option = NULL; + int abbrev_flags = 0, ambiguous_flags = 0; + + if (!arg_end) + arg_end = arg + strlen(arg); + + for (; options->type != OPTION_END; options++) { + const char *rest; + int flags = 0; + + if (!options->long_name) + continue; + + rest = skip_prefix(arg, options->long_name); + if (options->type == OPTION_ARGUMENT) { + if (!rest) + continue; + if (*rest == '=') + return opterror(options, "takes no value", flags); + if (*rest) + continue; + p->out[p->cpidx++] = arg - 2; + return 0; + } + if (!rest) { + if (!prefixcmp(options->long_name, "no-")) { + /* + * The long name itself starts with "no-", so + * accept the option without "no-" so that users + * do not have to enter "no-no-" to get the + * negation. + */ + rest = skip_prefix(arg, options->long_name + 3); + if (rest) { + flags |= OPT_UNSET; + goto match; + } + /* Abbreviated case */ + if (!prefixcmp(options->long_name + 3, arg)) { + flags |= OPT_UNSET; + goto is_abbreviated; + } + } + /* abbreviated? */ + if (!strncmp(options->long_name, arg, arg_end - arg)) { +is_abbreviated: + if (abbrev_option) { + /* + * If this is abbreviated, it is + * ambiguous. So when there is no + * exact match later, we need to + * error out. 
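+					 * (e.g. "--s" given both "--sort"
+					 * and "--show"; the names here are
+					 * only illustrative).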
+ */ + ambiguous_option = abbrev_option; + ambiguous_flags = abbrev_flags; + } + if (!(flags & OPT_UNSET) && *arg_end) + p->opt = arg_end + 1; + abbrev_option = options; + abbrev_flags = flags; + continue; + } + /* negated and abbreviated very much? */ + if (!prefixcmp("no-", arg)) { + flags |= OPT_UNSET; + goto is_abbreviated; + } + /* negated? */ + if (strncmp(arg, "no-", 3)) + continue; + flags |= OPT_UNSET; + rest = skip_prefix(arg + 3, options->long_name); + /* abbreviated and negated? */ + if (!rest && !prefixcmp(options->long_name, arg + 3)) + goto is_abbreviated; + if (!rest) + continue; + } +match: + if (*rest) { + if (*rest != '=') + continue; + p->opt = rest + 1; + } + return get_value(p, options, flags); + } + + if (ambiguous_option) { + fprintf(stderr, + " Error: Ambiguous option: %s (could be --%s%s or --%s%s)", + arg, + (ambiguous_flags & OPT_UNSET) ? "no-" : "", + ambiguous_option->long_name, + (abbrev_flags & OPT_UNSET) ? "no-" : "", + abbrev_option->long_name); + return -1; + } + if (abbrev_option) + return get_value(p, abbrev_option, abbrev_flags); + return -2; +} + +static void check_typos(const char *arg, const struct option *options) +{ + if (strlen(arg) < 3) + return; + + if (!prefixcmp(arg, "no-")) { + fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + + for (; options->type != OPTION_END; options++) { + if (!options->long_name) + continue; + if (!prefixcmp(options->long_name, arg)) { + fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + } +} + +static void parse_options_start(struct parse_opt_ctx_t *ctx, + int argc, const char **argv, int flags) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->argc = argc - 1; + ctx->argv = argv + 1; + ctx->out = argv; + ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0); + ctx->flags = flags; + if ((flags & PARSE_OPT_KEEP_UNKNOWN) && + (flags & PARSE_OPT_STOP_AT_NON_OPTION)) + die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together"); +} + +static int usage_with_options_internal(const char * const *, + const struct option *, int, + struct parse_opt_ctx_t *); + +static int parse_options_step(struct parse_opt_ctx_t *ctx, + const struct option *options, + const char * const usagestr[]) +{ + int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); + int excl_short_opt = 1; + const char *arg; + + /* we must reset ->opt, unknown short option leave it dangling */ + ctx->opt = NULL; + + for (; ctx->argc; ctx->argc--, ctx->argv++) { + arg = ctx->argv[0]; + if (*arg != '-' || !arg[1]) { + if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION) + break; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + continue; + } + + if (arg[1] != '-') { + ctx->opt = ++arg; + if (internal_help && *ctx->opt == 'h') { + return usage_with_options_internal(usagestr, options, 0, ctx); + } + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, options, arg, 1); + case -2: + goto unknown; + case -3: + goto exclusive; + default: + break; + } + if (ctx->opt) + check_typos(arg, options); + while (ctx->opt) { + if (internal_help && *ctx->opt == 'h') + return usage_with_options_internal(usagestr, options, 0, ctx); + arg = ctx->opt; + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, options, arg, 1); + case -2: + /* fake a short option thing to hide the fact that we may have + * started to parse aggregated stuff + * + * This is leaky, too bad. 
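+				 * (the strdup()ed string below is never
+				 * freed).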
+				 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				case -3:
+					goto exclusive;
+				default:
+					break;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		arg += 2;
+		if (internal_help && !strcmp(arg, "help-all"))
+			return usage_with_options_internal(usagestr, options, 1, ctx);
+		if (internal_help && !strcmp(arg, "help"))
+			return usage_with_options_internal(usagestr, options, 0, ctx);
+		if (!strcmp(arg, "list-opts"))
+			return PARSE_OPT_LIST_OPTS;
+		if (!strcmp(arg, "list-cmds"))
+			return PARSE_OPT_LIST_SUBCMDS;
+		switch (parse_long_opt(ctx, arg, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options, arg, 0);
+		case -2:
+			goto unknown;
+		case -3:
+			excl_short_opt = 0;
+			goto exclusive;
+		default:
+			break;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+
+exclusive:
+	parse_options_usage(usagestr, options, arg, excl_short_opt);
+	if ((excl_short_opt && ctx->excl_opt->short_name) ||
+	    ctx->excl_opt->long_name == NULL) {
+		char opt = ctx->excl_opt->short_name;
+		parse_options_usage(NULL, options, &opt, 1);
+	} else {
+		parse_options_usage(NULL, options, ctx->excl_opt->long_name, 0);
+	}
+	return PARSE_OPT_HELP;
+}
+
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+int parse_options_subcommand(int argc, const char **argv, const struct option *options,
+			const char *const subcommands[], const char *usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	/* build usage string if it's not provided */
+	if (subcommands && !usagestr[0]) {
+		char *buf = NULL;
+
+		astrcatf(&buf, "%s %s [<options>] {", subcmd_config.exec_name, argv[0]);
+
+		for (int i = 0; subcommands[i]; i++) {
+			if (i)
+				astrcat(&buf, "|");
+			astrcat(&buf, subcommands[i]);
+		}
+		astrcat(&buf, "}");
+
+		usagestr[0] = buf;
+	}
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	case PARSE_OPT_LIST_OPTS:
+		while (options->type != OPTION_END) {
+			if (options->long_name)
+				printf("--%s ", options->long_name);
+			options++;
+		}
+		putchar('\n');
+		exit(130);
+	case PARSE_OPT_LIST_SUBCMDS:
+		if (subcommands) {
+			for (int i = 0; subcommands[i]; i++)
+				printf("%s ", subcommands[i]);
+		}
+		putchar('\n');
+		exit(130);
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-')
+			astrcatf(&error_buf, "unknown option `%s'",
+				 ctx.argv[0] + 2);
+		else
+			astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt);
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		  const char * const usagestr[], int flags)
+{
+	return parse_options_subcommand(argc, argv, options, NULL,
+					(const char **) usagestr, flags);
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP 2
+
+static void print_option_help(const struct option *opts, int full)
+{
+	size_t pos;
+	int pad;
+
+	if (opts->type == OPTION_GROUP) {
+		fputc('\n', stderr);
+		if (*opts->help)
+			fprintf(stderr, "%s\n", opts->help);
+		return;
+	}
+	if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+		return;
+	if (opts->flags &
PARSE_OPT_DISABLED)
+		return;
+
+	pos = fprintf(stderr, "    ");
+	if (opts->short_name)
+		pos += fprintf(stderr, "-%c", opts->short_name);
+	else
+		pos += fprintf(stderr, "    ");
+
+	if (opts->long_name && opts->short_name)
+		pos += fprintf(stderr, ", ");
+	if (opts->long_name)
+		pos += fprintf(stderr, "--%s", opts->long_name);
+
+	switch (opts->type) {
+	case OPTION_ARGUMENT:
+		break;
+	case OPTION_LONG:
+	case OPTION_U64:
+	case OPTION_INTEGER:
+	case OPTION_UINTEGER:
+		if (opts->flags & PARSE_OPT_OPTARG)
+			if (opts->long_name)
+				pos += fprintf(stderr, "[=<n>]");
+			else
+				pos += fprintf(stderr, "[<n>]");
+		else
+			pos += fprintf(stderr, " <n>");
+		break;
+	case OPTION_CALLBACK:
+		if (opts->flags & PARSE_OPT_NOARG)
+			break;
+		/* FALLTHROUGH */
+	case OPTION_STRING:
+		if (opts->argh) {
+			if (opts->flags & PARSE_OPT_OPTARG)
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<%s>]", opts->argh);
+				else
+					pos += fprintf(stderr, "[<%s>]", opts->argh);
+			else
+				pos += fprintf(stderr, " <%s>", opts->argh);
+		} else {
+			if (opts->flags & PARSE_OPT_OPTARG)
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=...]");
+				else
+					pos += fprintf(stderr, "[...]");
+			else
+				pos += fprintf(stderr, " ...");
+		}
+		break;
+	default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
+	case OPTION_END:
+	case OPTION_GROUP:
+	case OPTION_BIT:
+	case OPTION_BOOLEAN:
+	case OPTION_INCR:
+	case OPTION_SET_UINT:
+	case OPTION_SET_PTR:
+		break;
+	}
+
+	if (pos <= USAGE_OPTS_WIDTH)
+		pad = USAGE_OPTS_WIDTH - pos;
+	else {
+		fputc('\n', stderr);
+		pad = USAGE_OPTS_WIDTH;
+	}
+	fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	if (opts->flags & PARSE_OPT_NOBUILD)
+		fprintf(stderr, "%*s(not built-in because %s)\n",
+			USAGE_OPTS_WIDTH + USAGE_GAP, "",
+			opts->build_opt);
+}
+
+static int option__cmp(const void *va, const void *vb)
+{
+	const struct option *a = va, *b = vb;
+	int sa = tolower(a->short_name), sb = tolower(b->short_name), ret;
+
+	if (sa == 0)
+		sa = 'z' + 1;
+	if (sb == 0)
+		sb = 'z' + 1;
+
+	ret = sa - sb;
+
+	if (ret == 0) {
+		const char *la = a->long_name ?: "",
+			   *lb = b->long_name ?: "";
+		ret = strcmp(la, lb);
+	}
+
+	return ret;
+}
+
+static struct option *options__order(const struct option *opts)
+{
+	int nr_opts = 0, len;
+	const struct option *o = opts;
+	struct option *ordered;
+
+	for (o = opts; o->type != OPTION_END; o++)
+		++nr_opts;
+
+	len = sizeof(*o) * (nr_opts + 1);
+	ordered = malloc(len);
+	if (!ordered)
+		goto out;
+	memcpy(ordered, opts, len);
+
+	qsort(ordered, nr_opts, sizeof(*o), option__cmp);
+out:
+	return ordered;
+}
+
+static bool option__in_argv(const struct option *opt, const struct parse_opt_ctx_t *ctx)
+{
+	int i;
+
+	for (i = 1; i < ctx->argc; ++i) {
+		const char *arg = ctx->argv[i];
+
+		if (arg[0] != '-') {
+			if (arg[1] == '\0') {
+				if (arg[0] == opt->short_name)
+					return true;
+				continue;
+			}
+
+			if (opt->long_name && strcmp(opt->long_name, arg) == 0)
+				return true;
+
+			if (opt->help && strcasestr(opt->help, arg) != NULL)
+				return true;
+
+			continue;
+		}
+
+		if (arg[1] == opt->short_name ||
+		    (arg[1] == '-' && opt->long_name && strcmp(opt->long_name, arg + 2) == 0))
+			return true;
+	}
+
+	return false;
+}
+
+static int usage_with_options_internal(const char * const *usagestr,
+				       const struct option *opts, int full,
+				       struct parse_opt_ctx_t *ctx)
+{
+	struct option *ordered;
+
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	setup_pager();
+
+	if (error_buf) {
+		fprintf(stderr, " Error: %s\n", error_buf);
+		zfree(&error_buf);
+	}
+
+	fprintf(stderr, "\n Usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "    or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+			**usagestr ? "    " : "",
+			*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	ordered = options__order(opts);
+	if (ordered)
+		opts = ordered;
+
+	for ( ; opts->type != OPTION_END; opts++) {
+		if (ctx && ctx->argc > 1 && !option__in_argv(opts, ctx))
+			continue;
+		print_option_help(opts, full);
+	}
+
+	fputc('\n', stderr);
+
+	free(ordered);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+			const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0, NULL);
+	exit(129);
+}
+
+void usage_with_options_msg(const char * const *usagestr,
+			    const struct option *opts, const char *fmt, ...)
+{
+	va_list ap;
+	char *tmp = error_buf;
+
+	va_start(ap, fmt);
+	if (vasprintf(&error_buf, fmt, ap) == -1)
+		die("vasprintf failed");
+	va_end(ap);
+
+	free(tmp);
+
+	usage_with_options_internal(usagestr, opts, 0, NULL);
+	exit(129);
+}
+
+int parse_options_usage(const char * const *usagestr,
+			const struct option *opts,
+			const char *optstr, bool short_opt)
+{
+	if (!usagestr)
+		goto opt;
+
+	fprintf(stderr, "\n Usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "    or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+			**usagestr ? "    " : "",
+			*usagestr);
+		usagestr++;
+	}
+	fputc('\n', stderr);
+
+opt:
+	for ( ; opts->type != OPTION_END; opts++) {
+		if (short_opt) {
+			if (opts->short_name == *optstr) {
+				print_option_help(opts, 0);
+				break;
+			}
+			continue;
+		}
+
+		if (opts->long_name == NULL)
+			continue;
+
+		if (!prefixcmp(opts->long_name, optstr))
+			print_option_help(opts, 0);
+		if (!prefixcmp("no-", optstr) &&
+		    !prefixcmp(opts->long_name, optstr + 3))
+			print_option_help(opts, 0);
+	}
+
+	return PARSE_OPT_HELP;
+}
+
+
+int parse_opt_verbosity_cb(const struct option *opt,
+			   const char *arg __maybe_unused,
+			   int unset)
+{
+	int *target = opt->value;
+
+	if (unset)
+		/* --no-quiet, --no-verbose */
+		*target = 0;
+	else if (opt->short_name == 'v') {
+		if (*target >= 0)
+			(*target)++;
+		else
+			*target = 1;
+	} else {
+		if (*target <= 0)
+			(*target)--;
+		else
+			*target = -1;
+	}
+	return 0;
+}
+
+static struct option *
+find_option(struct option *opts, int shortopt, const char *longopt)
+{
+	for (; opts->type != OPTION_END; opts++) {
+		if ((shortopt && opts->short_name == shortopt) ||
+		    (opts->long_name && longopt &&
+		     !strcmp(opts->long_name, longopt)))
+			return opts;
+	}
+	return NULL;
+}
+
+void set_option_flag(struct option *opts, int shortopt, const char *longopt,
+		     int flag)
+{
+	struct option *opt = find_option(opts, shortopt, longopt);
+
+	if (opt)
+		opt->flags |= flag;
+	return;
+}
+
+void set_option_nobuild(struct option *opts, int shortopt,
+			const char *longopt,
+			const char *build_opt,
+			bool can_skip)
+{
+	struct option *opt = find_option(opts, shortopt, longopt);
+
+	if (!opt)
+		return;
+
+	opt->flags |= PARSE_OPT_NOBUILD;
+	opt->flags |= can_skip ?
PARSE_OPT_CANSKIP : 0;
+	opt->build_opt = build_opt;
+}
diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h
new file mode 100644
index 0000000..dec893f
--- /dev/null
+++ b/tools/lib/subcmd/parse-options.h
@@ -0,0 +1,228 @@
+#ifndef __PERF_PARSE_OPTIONS_H
+#define __PERF_PARSE_OPTIONS_H
+
+#include <linux/kernel.h>
+#include <stdbool.h>
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN,
+	OPTION_INCR,
+	OPTION_SET_UINT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_LONG,
+	OPTION_CALLBACK,
+	OPTION_U64,
+	OPTION_UINTEGER,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG = 1,
+	PARSE_OPT_NOARG = 2,
+	PARSE_OPT_NONEG = 4,
+	PARSE_OPT_HIDDEN = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+	PARSE_OPT_DISABLED = 32,
+	PARSE_OPT_EXCLUSIVE = 64,
+	PARSE_OPT_NOEMPTY = 128,
+	PARSE_OPT_NOBUILD = 256,
+	PARSE_OPT_CANSKIP = 512,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogeneous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN: this option is skipped in the default usage, and shown
+ *                     in the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
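+ *   (for instance, get_arg() falls back to defval when
+ *   PARSE_OPT_LASTARG_DEFAULT applies).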
+ * + * `set`:: + * whether an option was set by the user + */ +struct option { + enum parse_opt_type type; + int short_name; + const char *long_name; + void *value; + const char *argh; + const char *help; + const char *build_opt; + + int flags; + parse_opt_cb *callback; + intptr_t defval; + bool *set; + void *data; +}; + +#define check_vtype(v, type) ( BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v ) + +#define OPT_END() { .type = OPTION_END } +#define OPT_ARGUMENT(l, h) { .type = OPTION_ARGUMENT, .long_name = (l), .help = (h) } +#define OPT_GROUP(h) { .type = OPTION_GROUP, .help = (h) } +#define OPT_BIT(s, l, v, h, b) { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h), .defval = (b) } +#define OPT_BOOLEAN(s, l, v, h) { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h) } +#define OPT_BOOLEAN_FLAG(s, l, v, h, f) { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h), .flags = (f) } +#define OPT_BOOLEAN_SET(s, l, v, os, h) \ + { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), \ + .value = check_vtype(v, bool *), .help = (h), \ + .set = check_vtype(os, bool *)} +#define OPT_INCR(s, l, v, h) { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } +#define OPT_SET_UINT(s, l, v, h, i) { .type = OPTION_SET_UINT, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h), .defval = (i) } +#define OPT_SET_PTR(s, l, v, h, p) { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) } +#define OPT_INTEGER(s, l, v, h) { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } +#define OPT_UINTEGER(s, l, v, h) { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) } +#define OPT_LONG(s, l, v, h) { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) } +#define OPT_U64(s, l, v, h) { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) } +#define OPT_STRING(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) } +#define OPT_STRING_OPTARG(s, l, v, a, h, d) \ + { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ + .value = check_vtype(v, const char **), (a), .help = (h), \ + .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) } +#define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} +#define OPT_DATE(s, l, v, h) \ + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb } +#define OPT_CALLBACK(s, l, v, a, h, f) \ + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f) } +#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \ + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG } +#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \ + { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback 
= (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT }
+#define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \
+	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\
+	.value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\
+	.flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG}
+#define OPT_CALLBACK_OPTARG(s, l, v, d, a, h, f) \
+	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), \
+	  .value = (v), (a), .help = (h), .callback = (f), \
+	  .flags = PARSE_OPT_OPTARG, .data = (d) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option arguments in argv[].
+ * Returns the number of arguments left in argv[].
+ *
+ * NOTE: parse_options() and parse_options_subcommand() may call exit() in the
+ * case of an error (or for 'special' options like --list-cmds or --list-opts).
+ */
+extern int parse_options(int argc, const char **argv,
+			 const struct option *options,
+			 const char * const usagestr[], int flags);
+
+extern int parse_options_subcommand(int argc, const char **argv,
+				const struct option *options,
+				const char *const subcommands[],
+				const char *usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+					const struct option *options);
+extern NORETURN __attribute__((format(printf,3,4)))
+void usage_with_options_msg(const char * const *usagestr,
+			    const struct option *options,
+			    const char *fmt, ...);
+
+/*----- incremental advanced APIs -----*/
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_LIST_OPTS,
+	PARSE_OPT_LIST_SUBCMDS,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
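+ * (Internally, ->out/->cpidx collect the skipped non-option arguments,
+ * which parse_options_end() then moves to the front of argv.)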
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	const struct option *excl_opt;
+	int flags;
+};
+
+extern int parse_options_usage(const char * const *usagestr,
+			       const struct option *opts,
+			       const char *optstr,
+			       bool short_opt);
+
+
+/*----- some often used options -----*/
+extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
+extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
+extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
+
+#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose")
+#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet")
+#define OPT__VERBOSITY(var) \
+	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
+	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
+#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run")
+#define OPT__ABBREV(var) \
+	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
+	  "use <n> digits to display SHA-1s", \
+	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
+
+extern const char *parse_options_fix_filename(const char *prefix, const char *file);
+
+void set_option_flag(struct option *opts, int sopt, const char *lopt, int flag);
+void set_option_nobuild(struct option *opts, int shortopt, const char *longopt,
+			const char *build_opt, bool can_skip);
+#endif /* __PERF_PARSE_OPTIONS_H */
diff --git a/tools/lib/subcmd/run-command.c b/tools/lib/subcmd/run-command.c
new file mode 100644
index 0000000..f4f6c9e
--- /dev/null
+++ b/tools/lib/subcmd/run-command.c
@@ -0,0 +1,227 @@
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include "subcmd-util.h"
+#include "run-command.h"
+#include "exec-cmd.h"
+
+#define STRERR_BUFSIZE 128
+
+static inline void close_pair(int fd[2])
+{
+	close(fd[0]);
+	close(fd[1]);
+}
+
+static inline void dup_devnull(int to)
+{
+	int fd = open("/dev/null", O_RDWR);
+	dup2(fd, to);
+	close(fd);
+}
+
+int start_command(struct child_process *cmd)
+{
+	int need_in, need_out, need_err;
+	int fdin[2], fdout[2], fderr[2];
+	char sbuf[STRERR_BUFSIZE];
+
+	/*
+	 * In case of errors we must keep the promise to close FDs
+	 * that have been passed in via ->in and ->out.
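+	 * Each error path below therefore closes them before returning.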
+ */ + + need_in = !cmd->no_stdin && cmd->in < 0; + if (need_in) { + if (pipe(fdin) < 0) { + if (cmd->out > 0) + close(cmd->out); + return -ERR_RUN_COMMAND_PIPE; + } + cmd->in = fdin[1]; + } + + need_out = !cmd->no_stdout + && !cmd->stdout_to_stderr + && cmd->out < 0; + if (need_out) { + if (pipe(fdout) < 0) { + if (need_in) + close_pair(fdin); + else if (cmd->in) + close(cmd->in); + return -ERR_RUN_COMMAND_PIPE; + } + cmd->out = fdout[0]; + } + + need_err = !cmd->no_stderr && cmd->err < 0; + if (need_err) { + if (pipe(fderr) < 0) { + if (need_in) + close_pair(fdin); + else if (cmd->in) + close(cmd->in); + if (need_out) + close_pair(fdout); + else if (cmd->out) + close(cmd->out); + return -ERR_RUN_COMMAND_PIPE; + } + cmd->err = fderr[0]; + } + + fflush(NULL); + cmd->pid = fork(); + if (!cmd->pid) { + if (cmd->no_stdin) + dup_devnull(0); + else if (need_in) { + dup2(fdin[0], 0); + close_pair(fdin); + } else if (cmd->in) { + dup2(cmd->in, 0); + close(cmd->in); + } + + if (cmd->no_stderr) + dup_devnull(2); + else if (need_err) { + dup2(fderr[1], 2); + close_pair(fderr); + } + + if (cmd->no_stdout) + dup_devnull(1); + else if (cmd->stdout_to_stderr) + dup2(2, 1); + else if (need_out) { + dup2(fdout[1], 1); + close_pair(fdout); + } else if (cmd->out > 1) { + dup2(cmd->out, 1); + close(cmd->out); + } + + if (cmd->dir && chdir(cmd->dir)) + die("exec %s: cd to %s failed (%s)", cmd->argv[0], + cmd->dir, strerror_r(errno, sbuf, sizeof(sbuf))); + if (cmd->env) { + for (; *cmd->env; cmd->env++) { + if (strchr(*cmd->env, '=')) + putenv((char*)*cmd->env); + else + unsetenv(*cmd->env); + } + } + if (cmd->preexec_cb) + cmd->preexec_cb(); + if (cmd->exec_cmd) { + execv_cmd(cmd->argv); + } else { + execvp(cmd->argv[0], (char *const*) cmd->argv); + } + exit(127); + } + + if (cmd->pid < 0) { + int err = errno; + if (need_in) + close_pair(fdin); + else if (cmd->in) + close(cmd->in); + if (need_out) + close_pair(fdout); + else if (cmd->out) + close(cmd->out); + if (need_err) + close_pair(fderr); + return err == ENOENT ? + -ERR_RUN_COMMAND_EXEC : + -ERR_RUN_COMMAND_FORK; + } + + if (need_in) + close(fdin[0]); + else if (cmd->in) + close(cmd->in); + + if (need_out) + close(fdout[1]); + else if (cmd->out) + close(cmd->out); + + if (need_err) + close(fderr[1]); + + return 0; +} + +static int wait_or_whine(pid_t pid) +{ + char sbuf[STRERR_BUFSIZE]; + + for (;;) { + int status, code; + pid_t waiting = waitpid(pid, &status, 0); + + if (waiting < 0) { + if (errno == EINTR) + continue; + fprintf(stderr, " Error: waitpid failed (%s)", + strerror_r(errno, sbuf, sizeof(sbuf))); + return -ERR_RUN_COMMAND_WAITPID; + } + if (waiting != pid) + return -ERR_RUN_COMMAND_WAITPID_WRONG_PID; + if (WIFSIGNALED(status)) + return -ERR_RUN_COMMAND_WAITPID_SIGNAL; + + if (!WIFEXITED(status)) + return -ERR_RUN_COMMAND_WAITPID_NOEXIT; + code = WEXITSTATUS(status); + switch (code) { + case 127: + return -ERR_RUN_COMMAND_EXEC; + case 0: + return 0; + default: + return -code; + } + } +} + +int finish_command(struct child_process *cmd) +{ + return wait_or_whine(cmd->pid); +} + +int run_command(struct child_process *cmd) +{ + int code = start_command(cmd); + if (code) + return code; + return finish_command(cmd); +} + +static void prepare_run_command_v_opt(struct child_process *cmd, + const char **argv, + int opt) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->argv = argv; + cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0; + cmd->exec_cmd = opt & RUN_EXEC_CMD ? 1 : 0; + cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 
1 : 0;
+}
+
+int run_command_v_opt(const char **argv, int opt)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	return run_command(&cmd);
+}
diff --git a/tools/lib/subcmd/run-command.h b/tools/lib/subcmd/run-command.h
new file mode 100644
index 0000000..4a55393
--- /dev/null
+++ b/tools/lib/subcmd/run-command.h
@@ -0,0 +1,60 @@
+#ifndef __PERF_RUN_COMMAND_H
+#define __PERF_RUN_COMMAND_H
+
+#include <unistd.h>
+
+enum {
+	ERR_RUN_COMMAND_FORK = 10000,
+	ERR_RUN_COMMAND_EXEC,
+	ERR_RUN_COMMAND_PIPE,
+	ERR_RUN_COMMAND_WAITPID,
+	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+	ERR_RUN_COMMAND_WAITPID_SIGNAL,
+	ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+	const char **argv;
+	pid_t pid;
+	/*
+	 * Using .in, .out, .err:
+	 * - Specify 0 for no redirections (child inherits stdin, stdout,
+	 *   stderr from parent).
+	 * - Specify -1 to have a pipe allocated as follows:
+	 *     .in: returns the writable pipe end; parent writes to it,
+	 *          the readable pipe end becomes child's stdin
+	 *     .out, .err: returns the readable pipe end; parent reads from
+	 *          it, the writable pipe end becomes child's stdout/stderr
+	 *   The caller of start_command() must close the returned FDs
+	 *   after it has completed reading from/writing to it!
+	 * - Specify > 0 to set a channel to a particular FD as follows:
+	 *     .in: a readable FD, becomes child's stdin
+	 *     .out: a writable FD, becomes child's stdout/stderr
+	 *     .err > 0 not supported
+	 *   The specified FD is closed by start_command(), even in case
+	 *   of errors!
+	 */
+	int in;
+	int out;
+	int err;
+	const char *dir;
+	const char *const *env;
+	unsigned no_stdin:1;
+	unsigned no_stdout:1;
+	unsigned no_stderr:1;
+	unsigned exec_cmd:1; /* if this is to be external sub-command */
+	unsigned stdout_to_stderr:1;
+	void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_EXEC_CMD 2 /* if this is to be external sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+#endif /* __PERF_RUN_COMMAND_H */
diff --git a/tools/lib/subcmd/sigchain.c b/tools/lib/subcmd/sigchain.c
new file mode 100644
index 0000000..3537c34
--- /dev/null
+++ b/tools/lib/subcmd/sigchain.c
@@ -0,0 +1,53 @@
+#include <signal.h>
+#include "subcmd-util.h"
+#include "sigchain.h"
+
+#define SIGCHAIN_MAX_SIGNALS 32
+
+struct sigchain_signal {
+	sigchain_fun *old;
+	int n;
+	int alloc;
+};
+static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
+
+static void check_signum(int sig)
+{
+	if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
+		die("BUG: signal out of range: %d", sig);
+}
+
+static int sigchain_push(int sig, sigchain_fun f)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+
+	ALLOC_GROW(s->old, s->n + 1, s->alloc);
+	s->old[s->n] = signal(sig, f);
+	if (s->old[s->n] == SIG_ERR)
+		return -1;
+	s->n++;
+	return 0;
+}
+
+int sigchain_pop(int sig)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+	if (s->n < 1)
+		return 0;
+
+	if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
+		return -1;
+	s->n--;
+	return 0;
+}
+
+void sigchain_push_common(sigchain_fun f)
+{
+	sigchain_push(SIGINT, f);
+	sigchain_push(SIGHUP, f);
+	sigchain_push(SIGTERM, f);
+	sigchain_push(SIGQUIT, f);
+	sigchain_push(SIGPIPE, f);
+}
diff --git a/tools/lib/subcmd/sigchain.h b/tools/lib/subcmd/sigchain.h
new file mode 100644
index 0000000..959d64e
--- /dev/null
+++ b/tools/lib/subcmd/sigchain.h
@@ -0,0 +1,10 @@
+#ifndef __PERF_SIGCHAIN_H
+#define __PERF_SIGCHAIN_H
+
+typedef void (*sigchain_fun)(int);
+
+int sigchain_pop(int sig);
+
+void sigchain_push_common(sigchain_fun f);
+
+#endif /* __PERF_SIGCHAIN_H */
diff --git a/tools/lib/subcmd/subcmd-config.c b/tools/lib/subcmd/subcmd-config.c
new file mode 100644
index 0000000..d017c72
--- /dev/null
+++ b/tools/lib/subcmd/subcmd-config.c
@@ -0,0 +1,11 @@
+#include "subcmd-config.h"
+
+#define UNDEFINED "SUBCMD_HAS_NOT_BEEN_INITIALIZED"
+
+struct subcmd_config subcmd_config = {
+	.exec_name = UNDEFINED,
+	.prefix = UNDEFINED,
+	.exec_path = UNDEFINED,
+	.exec_path_env = UNDEFINED,
+	.pager_env = UNDEFINED,
+};
diff --git a/tools/lib/subcmd/subcmd-config.h b/tools/lib/subcmd/subcmd-config.h
new file mode 100644
index 0000000..cc85140
--- /dev/null
+++ b/tools/lib/subcmd/subcmd-config.h
@@ -0,0 +1,14 @@
+#ifndef __PERF_SUBCMD_CONFIG_H
+#define __PERF_SUBCMD_CONFIG_H
+
+struct subcmd_config {
+	const char *exec_name;
+	const char *prefix;
+	const char *exec_path;
+	const char *exec_path_env;
+	const char *pager_env;
+};
+
+extern struct subcmd_config subcmd_config;
+
+#endif /* __PERF_SUBCMD_CONFIG_H */
diff --git a/tools/lib/subcmd/subcmd-util.h b/tools/lib/subcmd/subcmd-util.h
new file mode 100644
index 0000000..321aeb1
--- /dev/null
+++ b/tools/lib/subcmd/subcmd-util.h
@@ -0,0 +1,91 @@
+#ifndef __PERF_SUBCMD_UTIL_H
+#define __PERF_SUBCMD_UTIL_H
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define NORETURN __attribute__((__noreturn__))
+
+static inline void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+static NORETURN inline void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	report(" Fatal: ", err, params);
+	exit(128);
+	va_end(params);
+}
+
+#define zfree(ptr) ({ free(*ptr); *ptr = NULL; })
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			if (alloc_nr(alloc) < (nr)) \
+				alloc = (nr); \
+			else \
+				alloc = alloc_nr(alloc); \
+			x = xrealloc((x), alloc * sizeof(*(x))); \
+		} \
+	} while(0)
+
+static inline void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+#define astrcatf(out, fmt, ...)
\ +({ \ + char *tmp = *(out); \ + if (asprintf((out), "%s" fmt, tmp ?: "", ## __VA_ARGS__) == -1) \ + die("asprintf failed"); \ + free(tmp); \ +}) + +static inline void astrcat(char **out, const char *add) +{ + char *tmp = *out; + + if (asprintf(out, "%s%s", tmp ?: "", add) == -1) + die("asprintf failed"); + + free(tmp); +} + +static inline int prefixcmp(const char *str, const char *prefix) +{ + for (; ; str++, prefix++) + if (!*prefix) + return 0; + else if (*str != *prefix) + return (unsigned char)*prefix - (unsigned char)*str; +} + +#endif /* __PERF_SUBCMD_UTIL_H */ diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 2562eac..ce3932e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,6 +20,7 @@ tools/lib/traceevent tools/lib/bpf tools/lib/api tools/lib/bpf +tools/lib/subcmd tools/lib/hweight.c tools/lib/rbtree.c tools/lib/string.c diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 388ec64f..569fcf0 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -145,9 +145,10 @@ BISON = bison STRIP = strip AWK = awk -LIB_DIR = $(srctree)/tools/lib/api/ +LIB_DIR = $(srctree)/tools/lib/api/ TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/ -BPF_DIR = $(srctree)/tools/lib/bpf/ +BPF_DIR = $(srctree)/tools/lib/bpf/ +SUBCMD_DIR = $(srctree)/tools/lib/subcmd/ # include config/Makefile by default and rule out # non-config cases @@ -184,6 +185,7 @@ strip-libs = $(filter-out -l%,$(1)) ifneq ($(OUTPUT),) TE_PATH=$(OUTPUT) BPF_PATH=$(OUTPUT) + SUBCMD_PATH=$(OUTPUT) ifneq ($(subdir),) API_PATH=$(OUTPUT)/../lib/api/ else @@ -193,6 +195,7 @@ else TE_PATH=$(TRACE_EVENT_DIR) API_PATH=$(LIB_DIR) BPF_PATH=$(BPF_DIR) + SUBCMD_PATH=$(SUBCMD_DIR) endif LIBTRACEEVENT = $(TE_PATH)libtraceevent.a @@ -206,6 +209,8 @@ export LIBAPI LIBBPF = $(BPF_PATH)libbpf.a +LIBSUBCMD = $(SUBCMD_PATH)libsubcmd.a + # python extension build directories PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/ PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/ @@ -257,7 +262,7 @@ export PERL_PATH LIB_FILE=$(OUTPUT)libperf.a -PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT) +PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD) ifndef NO_LIBBPF PERFLIBS += $(LIBBPF) endif @@ -437,6 +442,13 @@ $(LIBBPF)-clean: $(call QUIET_CLEAN, libbpf) $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) clean >/dev/null +$(LIBSUBCMD): fixdep FORCE + $(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) $(OUTPUT)libsubcmd.a + +$(LIBSUBCMD)-clean: + $(call QUIET_CLEAN, libsubcmd) + $(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) clean + help: @echo 'Perf make targets:' @echo ' doc - make *all* documentation (see below)' @@ -584,7 +596,7 @@ config-clean: $(call QUIET_CLEAN, config) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null -clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean config-clean +clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) 
-name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)$(RM) $(OUTPUT).config-detected
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index b02af06..b64d462 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -26,7 +26,7 @@
 #include "../../util/evlist.h"
 #include "../../util/evsel.h"
 #include "../../util/cpumap.h"
-#include "../../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../../util/parse-events.h"
 #include "../../util/pmu.h"
 #include "../../util/debug.h"
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index fc9bebd..0999ac5 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index bc6a16a..6a18ce2 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -5,7 +5,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index ad0d9b5..7182386 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index 6d8c9fa..91aaf2a 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -10,7 +10,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index e5e41d3..f416bd7 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/stat.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "bench.h"
 #include "futex.h"
diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
index 9419b94..a91aa85 100644
--- a/tools/perf/bench/mem-functions.c
+++ b/tools/perf/bench/mem-functions.c
@@ -8,7 +8,7 @@
 #include "../perf.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/header.h"
 #include "../util/cloexec.h"
 #include "bench.h"
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 492df27..5049d63 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -7,7 +7,7 @@
 #include "../perf.h"
 #include "../builtin.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../util/cloexec.h"
 #include "bench.h"
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index d4ff1b5..bfaf950 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -11,7 +11,7 @@
 #include "../perf.h"
 #include "../util/util.h"
-#include "../util/parse-options.h"
+#include <subcmd/parse-options.h>
 #include "../builtin.h"
"../builtin.h" #include "bench.h" diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index 005cc28..1dc2d13 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -10,7 +10,7 @@ */ #include "../perf.h" #include "../util/util.h" -#include "../util/parse-options.h" +#include #include "../builtin.h" #include "bench.h" diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 1f00dc7..e18f1b9 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -21,7 +21,7 @@ #include "util/evsel.h" #include "util/annotate.h" #include "util/event.h" -#include "util/parse-options.h" +#include #include "util/parse-events.h" #include "util/thread.h" #include "util/sort.h" diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index b17aed3..a1cddc6 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -16,7 +16,7 @@ */ #include "perf.h" #include "util/util.h" -#include "util/parse-options.h" +#include #include "builtin.h" #include "bench/bench.h" diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 7b8450c..d93bff7 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -16,7 +16,7 @@ #include "util/cache.h" #include "util/debug.h" #include "util/header.h" -#include "util/parse-options.h" +#include #include "util/strlist.h" #include "util/build-id.h" #include "util/session.h" diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 6419f57..5e914ee 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -12,7 +12,7 @@ #include "util/build-id.h" #include "util/cache.h" #include "util/debug.h" -#include "util/parse-options.h" +#include #include "util/session.h" #include "util/symbol.h" #include "util/data.h" diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index 427ea7a..f04e804 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -9,7 +9,7 @@ #include "perf.h" #include "util/cache.h" -#include "util/parse-options.h" +#include #include "util/util.h" #include "util/debug.h" diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c index d6525bc..b97bc15 100644 --- a/tools/perf/builtin-data.c +++ b/tools/perf/builtin-data.c @@ -2,7 +2,7 @@ #include "builtin.h" #include "perf.h" #include "debug.h" -#include "parse-options.h" +#include #include "data-convert-bt.h" typedef int (*data_cmd_fn_t)(int argc, const char **argv, const char *prefix); diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index f4d6251..08a7d36 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -12,7 +12,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/parse-events.h" -#include "util/parse-options.h" +#include #include "util/session.h" #include "util/data.h" #include "util/debug.h" diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 275aa64..96c1a4c 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -6,11 +6,11 @@ #include "perf.h" #include "util/cache.h" #include "builtin.h" -#include "util/exec_cmd.h" +#include #include "common-cmds.h" -#include "util/parse-options.h" -#include "util/run-command.h" -#include "util/help.h" +#include +#include +#include #include "util/debug.h" static struct man_viewer_list { diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 99d127f..0022e02 100644 --- 
a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -18,7 +18,7 @@ #include "util/data.h" #include "util/auxtrace.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include <linux/list.h>
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 93ce665..1180105 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -12,7 +12,7 @@ #include "util/tool.h" #include "util/callchain.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/trace-event.h" #include "util/data.h" #include "util/cpumap.h"
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 031f9f5..4418d92 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -10,7 +10,7 @@ #include "util/header.h" #include "util/session.h" #include "util/intlist.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/trace-event.h" #include "util/debug.h" #include "util/tool.h"
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index bf679e2..5e22db4 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -14,7 +14,7 @@ #include "util/parse-events.h" #include "util/cache.h" #include "util/pmu.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused) {
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index de16aae..ce3bfb4 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -9,7 +9,7 @@ #include "util/thread.h" #include "util/header.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/trace-event.h" #include "util/debug.h"
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 80170aa..3901700 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -1,7 +1,7 @@ #include "builtin.h" #include "perf.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/trace-event.h" #include "util/tool.h" #include "util/session.h"
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index dbe2ea5..9af859b 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -37,7 +37,7 @@ #include "util/strfilter.h" #include "util/symbol.h" #include "util/debug.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/probe-finder.h" #include "util/probe-event.h" #include "util/probe-file.h"
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index a3b4930..1435ef6 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -11,7 +11,7 @@ #include "util/build-id.h" #include "util/util.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/parse-events.h" #include "util/callchain.h"
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 5a45466..2a7330b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -27,7 +27,7 @@ #include "util/session.h" #include "util/tool.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/parse-events.h" #include "util/thread.h"
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index e3d3e32..871b55ae 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -12,7 +12,7 @@ #include "util/tool.h" #include "util/cloexec.h" -#include "util/parse-options.h" +#include <subcmd/parse-options.h> #include "util/trace-event.h" #include "util/debug.h"
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 571016f..bcc3542 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -3,9 +3,9 @@ #include "perf.h" #include "util/cache.h"
"util/cache.h" #include "util/debug.h" -#include "util/exec_cmd.h" +#include #include "util/header.h" -#include "util/parse-options.h" +#include #include "util/perf_regs.h" #include "util/session.h" #include "util/tool.h" diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 25a95f4..bbf42ee 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -45,7 +45,7 @@ #include "builtin.h" #include "util/cgroup.h" #include "util/util.h" -#include "util/parse-options.h" +#include #include "util/parse-events.h" #include "util/pmu.h" #include "util/event.h" diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 30e5962..bd7a775 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -30,7 +30,7 @@ #include "perf.h" #include "util/header.h" -#include "util/parse-options.h" +#include #include "util/parse-events.h" #include "util/event.h" #include "util/session.h" diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 92fe963..9ebd67a 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -34,7 +34,7 @@ #include "util/top.h" #include "util/util.h" #include -#include "util/parse-options.h" +#include #include "util/parse-events.h" #include "util/cpumap.h" #include "util/xyarray.h" diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index c783d8f..20916dd 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -22,11 +22,11 @@ #include "util/color.h" #include "util/debug.h" #include "util/evlist.h" -#include "util/exec_cmd.h" +#include #include "util/machine.h" #include "util/session.h" #include "util/thread.h" -#include "util/parse-options.h" +#include #include "util/strlist.h" #include "util/intlist.h" #include "util/thread_map.h" diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 6894325..cb1d249 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -9,12 +9,12 @@ #include "builtin.h" #include "util/env.h" -#include "util/exec_cmd.h" +#include #include "util/cache.h" #include "util/quote.h" -#include "util/run-command.h" +#include #include "util/parse-events.h" -#include "util/parse-options.h" +#include #include "util/bpf-loader.h" #include "util/debug.h" #include diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c index 6337f1c..28d1605 100644 --- a/tools/perf/tests/attr.c +++ b/tools/perf/tests/attr.c @@ -24,7 +24,7 @@ #include #include "../perf.h" #include "util.h" -#include "exec_cmd.h" +#include #include "tests.h" #define ENV "PERF_TEST_ATTR" diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index fa98406..0372d59 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -11,7 +11,7 @@ #include "tests.h" #include "debug.h" #include "color.h" -#include "parse-options.h" +#include #include "symbol.h" struct test __weak arch_tests[] = { diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 196beef..94b1099 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -9,13 +9,10 @@ libperf-y += env.o libperf-y += event.o libperf-y += evlist.o libperf-y += evsel.o -libperf-y += exec_cmd.o libperf-y += find_next_bit.o -libperf-y += help.o libperf-y += kallsyms.o libperf-y += levenshtein.o libperf-y += llvm-utils.o -libperf-y += parse-options.o libperf-y += parse-events.o libperf-y += perf_regs.o libperf-y += path.o @@ -23,7 +20,6 @@ libperf-y += rbtree.o libperf-y += libstring.o libperf-y += bitmap.o libperf-y += hweight.o -libperf-y += run-command.o 
libperf-y += quote.o libperf-y += strbuf.o libperf-y += string.o @@ -32,11 +28,9 @@ libperf-y += strfilter.o libperf-y += top.o libperf-y += usage.o libperf-y += wrapper.o -libperf-y += sigchain.o libperf-y += dso.o libperf-y += symbol.o libperf-y += color.o -libperf-y += pager.o libperf-y += header.o libperf-y += callchain.o libperf-y += values.o @@ -88,7 +82,6 @@ libperf-y += parse-branch-options.o libperf-y += parse-regs-options.o libperf-y += term.o libperf-y += help-unknown-cmd.o -libperf-y += subcmd-config.o libperf-$(CONFIG_LIBBPF) += bpf-loader.o libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 7f10430a..360fda0 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -45,7 +45,7 @@ #include "event.h" #include "session.h" #include "debug.h" -#include "parse-options.h" +#include #include "intel-pt.h" #include "intel-bts.h" diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index fc6a745..07b5d63 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -4,7 +4,7 @@ #include #include "util.h" #include "strbuf.h" -#include "pager.h" +#include #include "../perf.h" #include "../ui/ui.h" diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 32e12ecf..90aa1b4 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -1,6 +1,6 @@ #include "util.h" #include "../perf.h" -#include "parse-options.h" +#include #include "evsel.h" #include "cgroup.h" #include "evlist.h" diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 2e452ac..d3e12e3 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -10,7 +10,7 @@ */ #include "util.h" #include "cache.h" -#include "exec_cmd.h" +#include #include "util/hist.h" /* perf_hist_config */ #include "util/llvm-utils.h" /* perf_llvm_config */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d1b6c20..8c44aad 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -18,7 +18,7 @@ #include #include "parse-events.h" -#include "parse-options.h" +#include #include diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c deleted file mode 100644 index e7f9ed79..0000000 --- a/tools/perf/util/exec_cmd.c +++ /dev/null @@ -1,209 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "subcmd-util.h" -#include "exec_cmd.h" -#include "subcmd-config.h" - -#define MAX_ARGS 32 -#define PATH_MAX 4096 - -static const char *argv_exec_path; -static const char *argv0_path; - -void exec_cmd_init(const char *exec_name, const char *prefix, - const char *exec_path, const char *exec_path_env) -{ - subcmd_config.exec_name = exec_name; - subcmd_config.prefix = prefix; - subcmd_config.exec_path = exec_path; - subcmd_config.exec_path_env = exec_path_env; -} - -#define is_dir_sep(c) ((c) == '/') - -static int is_absolute_path(const char *path) -{ - return path[0] == '/'; -} - -static const char *get_pwd_cwd(void) -{ - static char cwd[PATH_MAX + 1]; - char *pwd; - struct stat cwd_stat, pwd_stat; - if (getcwd(cwd, PATH_MAX) == NULL) - return NULL; - pwd = getenv("PWD"); - if (pwd && strcmp(pwd, cwd)) { - stat(cwd, &cwd_stat); - if (!stat(pwd, &pwd_stat) && - pwd_stat.st_dev == cwd_stat.st_dev && - pwd_stat.st_ino == cwd_stat.st_ino) { - strlcpy(cwd, pwd, PATH_MAX); - } - } - return cwd; -} - -static const char *make_nonrelative_path(const char *path) -{ - static char buf[PATH_MAX + 1]; - - if (is_absolute_path(path)) { - if 
(strlcpy(buf, path, PATH_MAX) >= PATH_MAX) - die("Too long path: %.*s", 60, path); - } else { - const char *cwd = get_pwd_cwd(); - if (!cwd) - die("Cannot determine the current working directory"); - if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX) - die("Too long path: %.*s", 60, path); - } - return buf; -} - -char *system_path(const char *path) -{ - char *buf = NULL; - - if (is_absolute_path(path)) - return strdup(path); - - astrcatf(&buf, "%s/%s", subcmd_config.prefix, path); - - return buf; -} - -const char *extract_argv0_path(const char *argv0) -{ - const char *slash; - - if (!argv0 || !*argv0) - return NULL; - slash = argv0 + strlen(argv0); - - while (argv0 <= slash && !is_dir_sep(*slash)) - slash--; - - if (slash >= argv0) { - argv0_path = strndup(argv0, slash - argv0); - return argv0_path ? slash + 1 : NULL; - } - - return argv0; -} - -void set_argv_exec_path(const char *exec_path) -{ - argv_exec_path = exec_path; - /* - * Propagate this setting to external programs. - */ - setenv(subcmd_config.exec_path_env, exec_path, 1); -} - - -/* Returns the highest-priority location to look for subprograms. */ -char *get_argv_exec_path(void) -{ - char *env; - - if (argv_exec_path) - return strdup(argv_exec_path); - - env = getenv(subcmd_config.exec_path_env); - if (env && *env) - return strdup(env); - - return system_path(subcmd_config.exec_path); -} - -static void add_path(char **out, const char *path) -{ - if (path && *path) { - if (is_absolute_path(path)) - astrcat(out, path); - else - astrcat(out, make_nonrelative_path(path)); - - astrcat(out, ":"); - } -} - -void setup_path(void) -{ - const char *old_path = getenv("PATH"); - char *new_path = NULL; - char *tmp = get_argv_exec_path(); - - add_path(&new_path, tmp); - add_path(&new_path, argv0_path); - free(tmp); - - if (old_path) - astrcat(&new_path, old_path); - else - astrcat(&new_path, "/usr/local/bin:/usr/bin:/bin"); - - setenv("PATH", new_path, 1); - - free(new_path); -} - -static const char **prepare_exec_cmd(const char **argv) -{ - int argc; - const char **nargv; - - for (argc = 0; argv[argc]; argc++) - ; /* just counting */ - nargv = malloc(sizeof(*nargv) * (argc + 2)); - - nargv[0] = subcmd_config.exec_name; - for (argc = 0; argv[argc]; argc++) - nargv[argc + 1] = argv[argc]; - nargv[argc + 1] = NULL; - return nargv; -} - -int execv_cmd(const char **argv) { - const char **nargv = prepare_exec_cmd(argv); - - /* execvp() can only ever return if it fails */ - execvp(subcmd_config.exec_name, (char **)nargv); - - free(nargv); - return -1; -} - - -int execl_cmd(const char *cmd,...) 
-{ - int argc; - const char *argv[MAX_ARGS + 1]; - const char *arg; - va_list param; - - va_start(param, cmd); - argv[0] = cmd; - argc = 1; - while (argc < MAX_ARGS) { - arg = argv[argc++] = va_arg(param, char *); - if (!arg) - break; - } - va_end(param); - if (MAX_ARGS <= argc) { - fprintf(stderr, " Error: too many args to run %s\n", cmd); - return -1; - } - - argv[argc] = NULL; - return execv_cmd(argv); -} diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h deleted file mode 100644 index f1bd343..0000000 --- a/tools/perf/util/exec_cmd.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __PERF_EXEC_CMD_H -#define __PERF_EXEC_CMD_H - -extern void exec_cmd_init(const char *exec_name, const char *prefix, - const char *exec_path, const char *exec_path_env); - -extern void set_argv_exec_path(const char *exec_path); -extern const char *extract_argv0_path(const char *path); -extern void setup_path(void); -extern int execv_cmd(const char **argv); /* NULL terminated */ -extern int execl_cmd(const char *cmd, ...); -/* get_argv_exec_path and system_path return malloc'd string, caller must free it */ -extern char *get_argv_exec_path(void); -extern char *system_path(const char *path); - -#endif /* __PERF_EXEC_CMD_H */ diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index a0820f1..dc1e41c 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -1,5 +1,5 @@ #include "cache.h" -#include "help.h" +#include #include "../builtin.h" #include "levenshtein.h" diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c deleted file mode 100644 index 8169480..0000000 --- a/tools/perf/util/help.c +++ /dev/null @@ -1,268 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "subcmd-util.h" -#include "help.h" -#include "exec_cmd.h" - -void add_cmdname(struct cmdnames *cmds, const char *name, size_t len) -{ - struct cmdname *ent = malloc(sizeof(*ent) + len + 1); - - ent->len = len; - memcpy(ent->name, name, len); - ent->name[len] = 0; - - ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc); - cmds->names[cmds->cnt++] = ent; -} - -void clean_cmdnames(struct cmdnames *cmds) -{ - unsigned int i; - - for (i = 0; i < cmds->cnt; ++i) - zfree(&cmds->names[i]); - zfree(&cmds->names); - cmds->cnt = 0; - cmds->alloc = 0; -} - -int cmdname_compare(const void *a_, const void *b_) -{ - struct cmdname *a = *(struct cmdname **)a_; - struct cmdname *b = *(struct cmdname **)b_; - return strcmp(a->name, b->name); -} - -void uniq(struct cmdnames *cmds) -{ - unsigned int i, j; - - if (!cmds->cnt) - return; - - for (i = j = 1; i < cmds->cnt; i++) - if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name)) - cmds->names[j++] = cmds->names[i]; - - cmds->cnt = j; -} - -void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) -{ - size_t ci, cj, ei; - int cmp; - - ci = cj = ei = 0; - while (ci < cmds->cnt && ei < excludes->cnt) { - cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); - if (cmp < 0) - cmds->names[cj++] = cmds->names[ci++]; - else if (cmp == 0) - ci++, ei++; - else if (cmp > 0) - ei++; - } - - while (ci < cmds->cnt) - cmds->names[cj++] = cmds->names[ci++]; - - cmds->cnt = cj; -} - -static void get_term_dimensions(struct winsize *ws) -{ - char *s = getenv("LINES"); - - if (s != NULL) { - ws->ws_row = atoi(s); - s = getenv("COLUMNS"); - if (s != NULL) { - ws->ws_col = atoi(s); - if (ws->ws_row && ws->ws_col) - return; - } - } -#ifdef TIOCGWINSZ - if (ioctl(1, TIOCGWINSZ, 
ws) == 0 && - ws->ws_row && ws->ws_col) - return; -#endif - ws->ws_row = 25; - ws->ws_col = 80; -} - -static void pretty_print_string_list(struct cmdnames *cmds, int longest) -{ - int cols = 1, rows; - int space = longest + 1; /* min 1 SP between words */ - struct winsize win; - int max_cols; - int i, j; - - get_term_dimensions(&win); - max_cols = win.ws_col - 1; /* don't print *on* the edge */ - - if (space < max_cols) - cols = max_cols / space; - rows = (cmds->cnt + cols - 1) / cols; - - for (i = 0; i < rows; i++) { - printf(" "); - - for (j = 0; j < cols; j++) { - unsigned int n = j * rows + i; - unsigned int size = space; - - if (n >= cmds->cnt) - break; - if (j == cols-1 || n + rows >= cmds->cnt) - size = 1; - printf("%-*s", size, cmds->names[n]->name); - } - putchar('\n'); - } -} - -static int is_executable(const char *name) -{ - struct stat st; - - if (stat(name, &st) || /* stat, not lstat */ - !S_ISREG(st.st_mode)) - return 0; - - return st.st_mode & S_IXUSR; -} - -static int has_extension(const char *filename, const char *ext) -{ - size_t len = strlen(filename); - size_t extlen = strlen(ext); - - return len > extlen && !memcmp(filename + len - extlen, ext, extlen); -} - -static void list_commands_in_dir(struct cmdnames *cmds, - const char *path, - const char *prefix) -{ - int prefix_len; - DIR *dir = opendir(path); - struct dirent *de; - char *buf = NULL; - - if (!dir) - return; - if (!prefix) - prefix = "perf-"; - prefix_len = strlen(prefix); - - astrcatf(&buf, "%s/", path); - - while ((de = readdir(dir)) != NULL) { - int entlen; - - if (prefixcmp(de->d_name, prefix)) - continue; - - astrcat(&buf, de->d_name); - if (!is_executable(buf)) - continue; - - entlen = strlen(de->d_name) - prefix_len; - if (has_extension(de->d_name, ".exe")) - entlen -= 4; - - add_cmdname(cmds, de->d_name + prefix_len, entlen); - } - closedir(dir); - free(buf); -} - -void load_command_list(const char *prefix, - struct cmdnames *main_cmds, - struct cmdnames *other_cmds) -{ - const char *env_path = getenv("PATH"); - char *exec_path = get_argv_exec_path(); - - if (exec_path) { - list_commands_in_dir(main_cmds, exec_path, prefix); - qsort(main_cmds->names, main_cmds->cnt, - sizeof(*main_cmds->names), cmdname_compare); - uniq(main_cmds); - } - - if (env_path) { - char *paths, *path, *colon; - path = paths = strdup(env_path); - while (1) { - if ((colon = strchr(path, ':'))) - *colon = 0; - if (!exec_path || strcmp(path, exec_path)) - list_commands_in_dir(other_cmds, path, prefix); - - if (!colon) - break; - path = colon + 1; - } - free(paths); - - qsort(other_cmds->names, other_cmds->cnt, - sizeof(*other_cmds->names), cmdname_compare); - uniq(other_cmds); - } - free(exec_path); - exclude_cmds(other_cmds, main_cmds); -} - -void list_commands(const char *title, struct cmdnames *main_cmds, - struct cmdnames *other_cmds) -{ - unsigned int i, longest = 0; - - for (i = 0; i < main_cmds->cnt; i++) - if (longest < main_cmds->names[i]->len) - longest = main_cmds->names[i]->len; - for (i = 0; i < other_cmds->cnt; i++) - if (longest < other_cmds->names[i]->len) - longest = other_cmds->names[i]->len; - - if (main_cmds->cnt) { - char *exec_path = get_argv_exec_path(); - printf("available %s in '%s'\n", title, exec_path); - printf("----------------"); - mput_char('-', strlen(title) + strlen(exec_path)); - putchar('\n'); - pretty_print_string_list(main_cmds, longest); - putchar('\n'); - free(exec_path); - } - - if (other_cmds->cnt) { - printf("%s available from elsewhere on your $PATH\n", title); - 
printf("---------------------------------------"); - mput_char('-', strlen(title)); - putchar('\n'); - pretty_print_string_list(other_cmds, longest); - putchar('\n'); - } -} - -int is_in_cmdlist(struct cmdnames *c, const char *s) -{ - unsigned int i; - - for (i = 0; i < c->cnt; i++) - if (!strcmp(s, c->names[i]->name)) - return 1; - return 0; -} diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h deleted file mode 100644 index 096c8bc..0000000 --- a/tools/perf/util/help.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __PERF_HELP_H -#define __PERF_HELP_H - -#include - -struct cmdnames { - size_t alloc; - size_t cnt; - struct cmdname { - size_t len; /* also used for similarity index in help.c */ - char name[]; - } **names; -}; - -static inline void mput_char(char c, unsigned int num) -{ - while(num--) - putchar(c); -} - -void load_command_list(const char *prefix, - struct cmdnames *main_cmds, - struct cmdnames *other_cmds); -void add_cmdname(struct cmdnames *cmds, const char *name, size_t len); -void clean_cmdnames(struct cmdnames *cmds); -int cmdname_compare(const void *a, const void *b); -void uniq(struct cmdnames *cmds); -/* Here we require that excludes is a sorted list. */ -void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes); -int is_in_cmdlist(struct cmdnames *c, const char *s); -void list_commands(const char *title, struct cmdnames *main_cmds, - struct cmdnames *other_cmds); - -#endif /* __PERF_HELP_H */ diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c deleted file mode 100644 index d50f3b58..0000000 --- a/tools/perf/util/pager.c +++ /dev/null @@ -1,100 +0,0 @@ -#include -#include -#include -#include -#include -#include "pager.h" -#include "run-command.h" -#include "sigchain.h" -#include "subcmd-config.h" - -/* - * This is split up from the rest of git so that we can do - * something different on Windows. 
- */ - -static int spawned_pager; - -void pager_init(const char *pager_env) -{ - subcmd_config.pager_env = pager_env; -} - -static void pager_preexec(void) -{ - /* - * Work around bug in "less" by not starting it until we - * have real input - */ - fd_set in; - - FD_ZERO(&in); - FD_SET(0, &in); - select(1, &in, NULL, &in, NULL); - - setenv("LESS", "FRSX", 0); -} - -static const char *pager_argv[] = { "sh", "-c", NULL, NULL }; -static struct child_process pager_process; - -static void wait_for_pager(void) -{ - fflush(stdout); - fflush(stderr); - /* signal EOF to pager */ - close(1); - close(2); - finish_command(&pager_process); -} - -static void wait_for_pager_signal(int signo) -{ - wait_for_pager(); - sigchain_pop(signo); - raise(signo); -} - -void setup_pager(void) -{ - const char *pager = getenv(subcmd_config.pager_env); - - if (!isatty(1)) - return; - if (!pager) - pager = getenv("PAGER"); - if (!(pager || access("/usr/bin/pager", X_OK))) - pager = "/usr/bin/pager"; - if (!(pager || access("/usr/bin/less", X_OK))) - pager = "/usr/bin/less"; - if (!pager) - pager = "cat"; - if (!*pager || !strcmp(pager, "cat")) - return; - - spawned_pager = 1; /* means we are emitting to terminal */ - - /* spawn the pager */ - pager_argv[2] = pager; - pager_process.argv = pager_argv; - pager_process.in = -1; - pager_process.preexec_cb = pager_preexec; - - if (start_command(&pager_process)) - return; - - /* original process continues, but writes to the pipe */ - dup2(pager_process.in, 1); - if (isatty(2)) - dup2(pager_process.in, 2); - close(pager_process.in); - - /* this makes sure that the parent terminates after the pager */ - sigchain_push_common(wait_for_pager_signal); - atexit(wait_for_pager); -} - -int pager_in_use(void) -{ - return spawned_pager; -} diff --git a/tools/perf/util/pager.h b/tools/perf/util/pager.h deleted file mode 100644 index d6a591a..0000000 --- a/tools/perf/util/pager.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __PERF_PAGER_H -#define __PERF_PAGER_H - -extern void pager_init(const char *pager_env); - -extern void setup_pager(void); -extern int pager_in_use(void); - -#endif /* __PERF_PAGER_H */ diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c index 355eecf..afc088d 100644 --- a/tools/perf/util/parse-branch-options.c +++ b/tools/perf/util/parse-branch-options.c @@ -1,7 +1,7 @@ #include "perf.h" #include "util/util.h" #include "util/debug.h" -#include "util/parse-options.h" +#include #include "util/parse-branch-options.h" #define BRANCH_OPT(n, m) \ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6fc8cd7..4f7b0ef 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -4,9 +4,9 @@ #include "../perf.h" #include "evlist.h" #include "evsel.h" -#include "parse-options.h" +#include #include "parse-events.h" -#include "exec_cmd.h" +#include #include "string.h" #include "symbol.h" #include "cache.h" diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c deleted file mode 100644 index 981bb44..0000000 --- a/tools/perf/util/parse-options.c +++ /dev/null @@ -1,983 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "subcmd-util.h" -#include "parse-options.h" -#include "subcmd-config.h" -#include "pager.h" - -#define OPT_SHORT 1 -#define OPT_UNSET 2 - -char *error_buf; - -static int opterror(const struct option *opt, const char *reason, int flags) -{ - if (flags & OPT_SHORT) - fprintf(stderr, " Error: switch `%c' %s", 
opt->short_name, reason); - else if (flags & OPT_UNSET) - fprintf(stderr, " Error: option `no-%s' %s", opt->long_name, reason); - else - fprintf(stderr, " Error: option `%s' %s", opt->long_name, reason); - - return -1; -} - -static const char *skip_prefix(const char *str, const char *prefix) -{ - size_t len = strlen(prefix); - return strncmp(str, prefix, len) ? NULL : str + len; -} - -static void optwarning(const struct option *opt, const char *reason, int flags) -{ - if (flags & OPT_SHORT) - fprintf(stderr, " Warning: switch `%c' %s", opt->short_name, reason); - else if (flags & OPT_UNSET) - fprintf(stderr, " Warning: option `no-%s' %s", opt->long_name, reason); - else - fprintf(stderr, " Warning: option `%s' %s", opt->long_name, reason); -} - -static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, - int flags, const char **arg) -{ - const char *res; - - if (p->opt) { - res = p->opt; - p->opt = NULL; - } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 || - **(p->argv + 1) == '-')) { - res = (const char *)opt->defval; - } else if (p->argc > 1) { - p->argc--; - res = *++p->argv; - } else - return opterror(opt, "requires a value", flags); - if (arg) - *arg = res; - return 0; -} - -static int get_value(struct parse_opt_ctx_t *p, - const struct option *opt, int flags) -{ - const char *s, *arg = NULL; - const int unset = flags & OPT_UNSET; - int err; - - if (unset && p->opt) - return opterror(opt, "takes no value", flags); - if (unset && (opt->flags & PARSE_OPT_NONEG)) - return opterror(opt, "isn't available", flags); - if (opt->flags & PARSE_OPT_DISABLED) - return opterror(opt, "is not usable", flags); - - if (opt->flags & PARSE_OPT_EXCLUSIVE) { - if (p->excl_opt && p->excl_opt != opt) { - char msg[128]; - - if (((flags & OPT_SHORT) && p->excl_opt->short_name) || - p->excl_opt->long_name == NULL) { - snprintf(msg, sizeof(msg), "cannot be used with switch `%c'", - p->excl_opt->short_name); - } else { - snprintf(msg, sizeof(msg), "cannot be used with %s", - p->excl_opt->long_name); - } - opterror(opt, msg, flags); - return -3; - } - p->excl_opt = opt; - } - if (!(flags & OPT_SHORT) && p->opt) { - switch (opt->type) { - case OPTION_CALLBACK: - if (!(opt->flags & PARSE_OPT_NOARG)) - break; - /* FALLTHROUGH */ - case OPTION_BOOLEAN: - case OPTION_INCR: - case OPTION_BIT: - case OPTION_SET_UINT: - case OPTION_SET_PTR: - return opterror(opt, "takes no value", flags); - case OPTION_END: - case OPTION_ARGUMENT: - case OPTION_GROUP: - case OPTION_STRING: - case OPTION_INTEGER: - case OPTION_UINTEGER: - case OPTION_LONG: - case OPTION_U64: - default: - break; - } - } - - if (opt->flags & PARSE_OPT_NOBUILD) { - char reason[128]; - bool noarg = false; - - err = snprintf(reason, sizeof(reason), - opt->flags & PARSE_OPT_CANSKIP ? - "is being ignored because %s " : - "is not available because %s", - opt->build_opt); - reason[sizeof(reason) - 1] = '\0'; - - if (err < 0) - strncpy(reason, opt->flags & PARSE_OPT_CANSKIP ? 
- "is being ignored" : - "is not available", - sizeof(reason)); - - if (!(opt->flags & PARSE_OPT_CANSKIP)) - return opterror(opt, reason, flags); - - err = 0; - if (unset) - noarg = true; - if (opt->flags & PARSE_OPT_NOARG) - noarg = true; - if (opt->flags & PARSE_OPT_OPTARG && !p->opt) - noarg = true; - - switch (opt->type) { - case OPTION_BOOLEAN: - case OPTION_INCR: - case OPTION_BIT: - case OPTION_SET_UINT: - case OPTION_SET_PTR: - case OPTION_END: - case OPTION_ARGUMENT: - case OPTION_GROUP: - noarg = true; - break; - case OPTION_CALLBACK: - case OPTION_STRING: - case OPTION_INTEGER: - case OPTION_UINTEGER: - case OPTION_LONG: - case OPTION_U64: - default: - break; - } - - if (!noarg) - err = get_arg(p, opt, flags, NULL); - if (err) - return err; - - optwarning(opt, reason, flags); - return 0; - } - - switch (opt->type) { - case OPTION_BIT: - if (unset) - *(int *)opt->value &= ~opt->defval; - else - *(int *)opt->value |= opt->defval; - return 0; - - case OPTION_BOOLEAN: - *(bool *)opt->value = unset ? false : true; - if (opt->set) - *(bool *)opt->set = true; - return 0; - - case OPTION_INCR: - *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1; - return 0; - - case OPTION_SET_UINT: - *(unsigned int *)opt->value = unset ? 0 : opt->defval; - return 0; - - case OPTION_SET_PTR: - *(void **)opt->value = unset ? NULL : (void *)opt->defval; - return 0; - - case OPTION_STRING: - err = 0; - if (unset) - *(const char **)opt->value = NULL; - else if (opt->flags & PARSE_OPT_OPTARG && !p->opt) - *(const char **)opt->value = (const char *)opt->defval; - else - err = get_arg(p, opt, flags, (const char **)opt->value); - - /* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */ - if (opt->flags & PARSE_OPT_NOEMPTY) { - const char *val = *(const char **)opt->value; - - if (!val) - return err; - - /* Similar to unset if we are given an empty string. */ - if (val[0] == '\0') { - *(const char **)opt->value = NULL; - return 0; - } - } - - return err; - - case OPTION_CALLBACK: - if (unset) - return (*opt->callback)(opt, NULL, 1) ? (-1) : 0; - if (opt->flags & PARSE_OPT_NOARG) - return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; - if (opt->flags & PARSE_OPT_OPTARG && !p->opt) - return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; - if (get_arg(p, opt, flags, &arg)) - return -1; - return (*opt->callback)(opt, arg, 0) ? 
(-1) : 0; - - case OPTION_INTEGER: - if (unset) { - *(int *)opt->value = 0; - return 0; - } - if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { - *(int *)opt->value = opt->defval; - return 0; - } - if (get_arg(p, opt, flags, &arg)) - return -1; - *(int *)opt->value = strtol(arg, (char **)&s, 10); - if (*s) - return opterror(opt, "expects a numerical value", flags); - return 0; - - case OPTION_UINTEGER: - if (unset) { - *(unsigned int *)opt->value = 0; - return 0; - } - if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { - *(unsigned int *)opt->value = opt->defval; - return 0; - } - if (get_arg(p, opt, flags, &arg)) - return -1; - *(unsigned int *)opt->value = strtol(arg, (char **)&s, 10); - if (*s) - return opterror(opt, "expects a numerical value", flags); - return 0; - - case OPTION_LONG: - if (unset) { - *(long *)opt->value = 0; - return 0; - } - if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { - *(long *)opt->value = opt->defval; - return 0; - } - if (get_arg(p, opt, flags, &arg)) - return -1; - *(long *)opt->value = strtol(arg, (char **)&s, 10); - if (*s) - return opterror(opt, "expects a numerical value", flags); - return 0; - - case OPTION_U64: - if (unset) { - *(u64 *)opt->value = 0; - return 0; - } - if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { - *(u64 *)opt->value = opt->defval; - return 0; - } - if (get_arg(p, opt, flags, &arg)) - return -1; - *(u64 *)opt->value = strtoull(arg, (char **)&s, 10); - if (*s) - return opterror(opt, "expects a numerical value", flags); - return 0; - - case OPTION_END: - case OPTION_ARGUMENT: - case OPTION_GROUP: - default: - die("should not happen, someone must be hit on the forehead"); - } -} - -static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options) -{ - for (; options->type != OPTION_END; options++) { - if (options->short_name == *p->opt) { - p->opt = p->opt[1] ? p->opt + 1 : NULL; - return get_value(p, options, OPT_SHORT); - } - } - return -2; -} - -static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, - const struct option *options) -{ - const char *arg_end = strchr(arg, '='); - const struct option *abbrev_option = NULL, *ambiguous_option = NULL; - int abbrev_flags = 0, ambiguous_flags = 0; - - if (!arg_end) - arg_end = arg + strlen(arg); - - for (; options->type != OPTION_END; options++) { - const char *rest; - int flags = 0; - - if (!options->long_name) - continue; - - rest = skip_prefix(arg, options->long_name); - if (options->type == OPTION_ARGUMENT) { - if (!rest) - continue; - if (*rest == '=') - return opterror(options, "takes no value", flags); - if (*rest) - continue; - p->out[p->cpidx++] = arg - 2; - return 0; - } - if (!rest) { - if (!prefixcmp(options->long_name, "no-")) { - /* - * The long name itself starts with "no-", so - * accept the option without "no-" so that users - * do not have to enter "no-no-" to get the - * negation. - */ - rest = skip_prefix(arg, options->long_name + 3); - if (rest) { - flags |= OPT_UNSET; - goto match; - } - /* Abbreviated case */ - if (!prefixcmp(options->long_name + 3, arg)) { - flags |= OPT_UNSET; - goto is_abbreviated; - } - } - /* abbreviated? */ - if (!strncmp(options->long_name, arg, arg_end - arg)) { -is_abbreviated: - if (abbrev_option) { - /* - * If this is abbreviated, it is - * ambiguous. So when there is no - * exact match later, we need to - * error out. 
- */ - ambiguous_option = abbrev_option; - ambiguous_flags = abbrev_flags; - } - if (!(flags & OPT_UNSET) && *arg_end) - p->opt = arg_end + 1; - abbrev_option = options; - abbrev_flags = flags; - continue; - } - /* negated and abbreviated very much? */ - if (!prefixcmp("no-", arg)) { - flags |= OPT_UNSET; - goto is_abbreviated; - } - /* negated? */ - if (strncmp(arg, "no-", 3)) - continue; - flags |= OPT_UNSET; - rest = skip_prefix(arg + 3, options->long_name); - /* abbreviated and negated? */ - if (!rest && !prefixcmp(options->long_name, arg + 3)) - goto is_abbreviated; - if (!rest) - continue; - } -match: - if (*rest) { - if (*rest != '=') - continue; - p->opt = rest + 1; - } - return get_value(p, options, flags); - } - - if (ambiguous_option) { - fprintf(stderr, - " Error: Ambiguous option: %s (could be --%s%s or --%s%s)", - arg, - (ambiguous_flags & OPT_UNSET) ? "no-" : "", - ambiguous_option->long_name, - (abbrev_flags & OPT_UNSET) ? "no-" : "", - abbrev_option->long_name); - return -1; - } - if (abbrev_option) - return get_value(p, abbrev_option, abbrev_flags); - return -2; -} - -static void check_typos(const char *arg, const struct option *options) -{ - if (strlen(arg) < 3) - return; - - if (!prefixcmp(arg, "no-")) { - fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg); - exit(129); - } - - for (; options->type != OPTION_END; options++) { - if (!options->long_name) - continue; - if (!prefixcmp(options->long_name, arg)) { - fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg); - exit(129); - } - } -} - -static void parse_options_start(struct parse_opt_ctx_t *ctx, - int argc, const char **argv, int flags) -{ - memset(ctx, 0, sizeof(*ctx)); - ctx->argc = argc - 1; - ctx->argv = argv + 1; - ctx->out = argv; - ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0); - ctx->flags = flags; - if ((flags & PARSE_OPT_KEEP_UNKNOWN) && - (flags & PARSE_OPT_STOP_AT_NON_OPTION)) - die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together"); -} - -static int usage_with_options_internal(const char * const *, - const struct option *, int, - struct parse_opt_ctx_t *); - -static int parse_options_step(struct parse_opt_ctx_t *ctx, - const struct option *options, - const char * const usagestr[]) -{ - int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); - int excl_short_opt = 1; - const char *arg; - - /* we must reset ->opt, unknown short option leave it dangling */ - ctx->opt = NULL; - - for (; ctx->argc; ctx->argc--, ctx->argv++) { - arg = ctx->argv[0]; - if (*arg != '-' || !arg[1]) { - if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION) - break; - ctx->out[ctx->cpidx++] = ctx->argv[0]; - continue; - } - - if (arg[1] != '-') { - ctx->opt = ++arg; - if (internal_help && *ctx->opt == 'h') { - return usage_with_options_internal(usagestr, options, 0, ctx); - } - switch (parse_short_opt(ctx, options)) { - case -1: - return parse_options_usage(usagestr, options, arg, 1); - case -2: - goto unknown; - case -3: - goto exclusive; - default: - break; - } - if (ctx->opt) - check_typos(arg, options); - while (ctx->opt) { - if (internal_help && *ctx->opt == 'h') - return usage_with_options_internal(usagestr, options, 0, ctx); - arg = ctx->opt; - switch (parse_short_opt(ctx, options)) { - case -1: - return parse_options_usage(usagestr, options, arg, 1); - case -2: - /* fake a short option thing to hide the fact that we may have - * started to parse aggregated stuff - * - * This is leaky, too bad. 
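
The abbreviation logic above is worth spelling out. A summary of what parse_long_opt() accepts for a hypothetical OPT_BOOLEAN(0, "verbose", ...) option, derived directly from the matching code:

	/*
	 * --verbose      exact match, sets the flag
	 * --verb         unique prefix, same effect (is_abbreviated path)
	 * --no-verbose   negation, clears the flag (OPT_UNSET)
	 * --no-verb      abbreviated negation
	 *
	 * A prefix that matches two different long names trips the
	 * "Ambiguous option" error above instead of guessing.
	 */
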
- */ - ctx->argv[0] = strdup(ctx->opt - 1); - *(char *)ctx->argv[0] = '-'; - goto unknown; - case -3: - goto exclusive; - default: - break; - } - } - continue; - } - - if (!arg[2]) { /* "--" */ - if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) { - ctx->argc--; - ctx->argv++; - } - break; - } - - arg += 2; - if (internal_help && !strcmp(arg, "help-all")) - return usage_with_options_internal(usagestr, options, 1, ctx); - if (internal_help && !strcmp(arg, "help")) - return usage_with_options_internal(usagestr, options, 0, ctx); - if (!strcmp(arg, "list-opts")) - return PARSE_OPT_LIST_OPTS; - if (!strcmp(arg, "list-cmds")) - return PARSE_OPT_LIST_SUBCMDS; - switch (parse_long_opt(ctx, arg, options)) { - case -1: - return parse_options_usage(usagestr, options, arg, 0); - case -2: - goto unknown; - case -3: - excl_short_opt = 0; - goto exclusive; - default: - break; - } - continue; -unknown: - if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN)) - return PARSE_OPT_UNKNOWN; - ctx->out[ctx->cpidx++] = ctx->argv[0]; - ctx->opt = NULL; - } - return PARSE_OPT_DONE; - -exclusive: - parse_options_usage(usagestr, options, arg, excl_short_opt); - if ((excl_short_opt && ctx->excl_opt->short_name) || - ctx->excl_opt->long_name == NULL) { - char opt = ctx->excl_opt->short_name; - parse_options_usage(NULL, options, &opt, 1); - } else { - parse_options_usage(NULL, options, ctx->excl_opt->long_name, 0); - } - return PARSE_OPT_HELP; -} - -static int parse_options_end(struct parse_opt_ctx_t *ctx) -{ - memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out)); - ctx->out[ctx->cpidx + ctx->argc] = NULL; - return ctx->cpidx + ctx->argc; -} - -int parse_options_subcommand(int argc, const char **argv, const struct option *options, - const char *const subcommands[], const char *usagestr[], int flags) -{ - struct parse_opt_ctx_t ctx; - - /* build usage string if it's not provided */ - if (subcommands && !usagestr[0]) { - char *buf = NULL; - - astrcatf(&buf, "%s %s [] {", subcmd_config.exec_name, argv[0]); - - for (int i = 0; subcommands[i]; i++) { - if (i) - astrcat(&buf, "|"); - astrcat(&buf, subcommands[i]); - } - astrcat(&buf, "}"); - - usagestr[0] = buf; - } - - parse_options_start(&ctx, argc, argv, flags); - switch (parse_options_step(&ctx, options, usagestr)) { - case PARSE_OPT_HELP: - exit(129); - case PARSE_OPT_DONE: - break; - case PARSE_OPT_LIST_OPTS: - while (options->type != OPTION_END) { - if (options->long_name) - printf("--%s ", options->long_name); - options++; - } - putchar('\n'); - exit(130); - case PARSE_OPT_LIST_SUBCMDS: - if (subcommands) { - for (int i = 0; subcommands[i]; i++) - printf("%s ", subcommands[i]); - } - putchar('\n'); - exit(130); - default: /* PARSE_OPT_UNKNOWN */ - if (ctx.argv[0][1] == '-') - astrcatf(&error_buf, "unknown option `%s'", - ctx.argv[0] + 2); - else - astrcatf(&error_buf, "unknown switch `%c'", *ctx.opt); - usage_with_options(usagestr, options); - } - - return parse_options_end(&ctx); -} - -int parse_options(int argc, const char **argv, const struct option *options, - const char * const usagestr[], int flags) -{ - return parse_options_subcommand(argc, argv, options, NULL, - (const char **) usagestr, flags); -} - -#define USAGE_OPTS_WIDTH 24 -#define USAGE_GAP 2 - -static void print_option_help(const struct option *opts, int full) -{ - size_t pos; - int pad; - - if (opts->type == OPTION_GROUP) { - fputc('\n', stderr); - if (*opts->help) - fprintf(stderr, "%s\n", opts->help); - return; - } - if (!full && (opts->flags & PARSE_OPT_HIDDEN)) - return; - if (opts->flags & 
PARSE_OPT_DISABLED) - return; - - pos = fprintf(stderr, " "); - if (opts->short_name) - pos += fprintf(stderr, "-%c", opts->short_name); - else - pos += fprintf(stderr, " "); - - if (opts->long_name && opts->short_name) - pos += fprintf(stderr, ", "); - if (opts->long_name) - pos += fprintf(stderr, "--%s", opts->long_name); - - switch (opts->type) { - case OPTION_ARGUMENT: - break; - case OPTION_LONG: - case OPTION_U64: - case OPTION_INTEGER: - case OPTION_UINTEGER: - if (opts->flags & PARSE_OPT_OPTARG) - if (opts->long_name) - pos += fprintf(stderr, "[=]"); - else - pos += fprintf(stderr, "[]"); - else - pos += fprintf(stderr, " "); - break; - case OPTION_CALLBACK: - if (opts->flags & PARSE_OPT_NOARG) - break; - /* FALLTHROUGH */ - case OPTION_STRING: - if (opts->argh) { - if (opts->flags & PARSE_OPT_OPTARG) - if (opts->long_name) - pos += fprintf(stderr, "[=<%s>]", opts->argh); - else - pos += fprintf(stderr, "[<%s>]", opts->argh); - else - pos += fprintf(stderr, " <%s>", opts->argh); - } else { - if (opts->flags & PARSE_OPT_OPTARG) - if (opts->long_name) - pos += fprintf(stderr, "[=...]"); - else - pos += fprintf(stderr, "[...]"); - else - pos += fprintf(stderr, " ..."); - } - break; - default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */ - case OPTION_END: - case OPTION_GROUP: - case OPTION_BIT: - case OPTION_BOOLEAN: - case OPTION_INCR: - case OPTION_SET_UINT: - case OPTION_SET_PTR: - break; - } - - if (pos <= USAGE_OPTS_WIDTH) - pad = USAGE_OPTS_WIDTH - pos; - else { - fputc('\n', stderr); - pad = USAGE_OPTS_WIDTH; - } - fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help); - if (opts->flags & PARSE_OPT_NOBUILD) - fprintf(stderr, "%*s(not built-in because %s)\n", - USAGE_OPTS_WIDTH + USAGE_GAP, "", - opts->build_opt); -} - -static int option__cmp(const void *va, const void *vb) -{ - const struct option *a = va, *b = vb; - int sa = tolower(a->short_name), sb = tolower(b->short_name), ret; - - if (sa == 0) - sa = 'z' + 1; - if (sb == 0) - sb = 'z' + 1; - - ret = sa - sb; - - if (ret == 0) { - const char *la = a->long_name ?: "", - *lb = b->long_name ?: ""; - ret = strcmp(la, lb); - } - - return ret; -} - -static struct option *options__order(const struct option *opts) -{ - int nr_opts = 0, len; - const struct option *o = opts; - struct option *ordered; - - for (o = opts; o->type != OPTION_END; o++) - ++nr_opts; - - len = sizeof(*o) * (nr_opts + 1); - ordered = malloc(len); - if (!ordered) - goto out; - memcpy(ordered, opts, len); - - qsort(ordered, nr_opts, sizeof(*o), option__cmp); -out: - return ordered; -} - -static bool option__in_argv(const struct option *opt, const struct parse_opt_ctx_t *ctx) -{ - int i; - - for (i = 1; i < ctx->argc; ++i) { - const char *arg = ctx->argv[i]; - - if (arg[0] != '-') { - if (arg[1] == '\0') { - if (arg[0] == opt->short_name) - return true; - continue; - } - - if (opt->long_name && strcmp(opt->long_name, arg) == 0) - return true; - - if (opt->help && strcasestr(opt->help, arg) != NULL) - return true; - - continue; - } - - if (arg[1] == opt->short_name || - (arg[1] == '-' && opt->long_name && strcmp(opt->long_name, arg + 2) == 0)) - return true; - } - - return false; -} - -static int usage_with_options_internal(const char * const *usagestr, - const struct option *opts, int full, - struct parse_opt_ctx_t *ctx) -{ - struct option *ordered; - - if (!usagestr) - return PARSE_OPT_HELP; - - setup_pager(); - - if (error_buf) { - fprintf(stderr, " Error: %s\n", error_buf); - zfree(&error_buf); - } - - fprintf(stderr, "\n Usage: %s\n", *usagestr++); - 
while (*usagestr && **usagestr) - fprintf(stderr, " or: %s\n", *usagestr++); - while (*usagestr) { - fprintf(stderr, "%s%s\n", - **usagestr ? " " : "", - *usagestr); - usagestr++; - } - - if (opts->type != OPTION_GROUP) - fputc('\n', stderr); - - ordered = options__order(opts); - if (ordered) - opts = ordered; - - for ( ; opts->type != OPTION_END; opts++) { - if (ctx && ctx->argc > 1 && !option__in_argv(opts, ctx)) - continue; - print_option_help(opts, full); - } - - fputc('\n', stderr); - - free(ordered); - - return PARSE_OPT_HELP; -} - -void usage_with_options(const char * const *usagestr, - const struct option *opts) -{ - usage_with_options_internal(usagestr, opts, 0, NULL); - exit(129); -} - -void usage_with_options_msg(const char * const *usagestr, - const struct option *opts, const char *fmt, ...) -{ - va_list ap; - char *tmp = error_buf; - - va_start(ap, fmt); - if (vasprintf(&error_buf, fmt, ap) == -1) - die("vasprintf failed"); - va_end(ap); - - free(tmp); - - usage_with_options_internal(usagestr, opts, 0, NULL); - exit(129); -} - -int parse_options_usage(const char * const *usagestr, - const struct option *opts, - const char *optstr, bool short_opt) -{ - if (!usagestr) - goto opt; - - fprintf(stderr, "\n Usage: %s\n", *usagestr++); - while (*usagestr && **usagestr) - fprintf(stderr, " or: %s\n", *usagestr++); - while (*usagestr) { - fprintf(stderr, "%s%s\n", - **usagestr ? " " : "", - *usagestr); - usagestr++; - } - fputc('\n', stderr); - -opt: - for ( ; opts->type != OPTION_END; opts++) { - if (short_opt) { - if (opts->short_name == *optstr) { - print_option_help(opts, 0); - break; - } - continue; - } - - if (opts->long_name == NULL) - continue; - - if (!prefixcmp(opts->long_name, optstr)) - print_option_help(opts, 0); - if (!prefixcmp("no-", optstr) && - !prefixcmp(opts->long_name, optstr + 3)) - print_option_help(opts, 0); - } - - return PARSE_OPT_HELP; -} - - -int parse_opt_verbosity_cb(const struct option *opt, - const char *arg __maybe_unused, - int unset) -{ - int *target = opt->value; - - if (unset) - /* --no-quiet, --no-verbose */ - *target = 0; - else if (opt->short_name == 'v') { - if (*target >= 0) - (*target)++; - else - *target = 1; - } else { - if (*target <= 0) - (*target)--; - else - *target = -1; - } - return 0; -} - -static struct option * -find_option(struct option *opts, int shortopt, const char *longopt) -{ - for (; opts->type != OPTION_END; opts++) { - if ((shortopt && opts->short_name == shortopt) || - (opts->long_name && longopt && - !strcmp(opts->long_name, longopt))) - return opts; - } - return NULL; -} - -void set_option_flag(struct option *opts, int shortopt, const char *longopt, - int flag) -{ - struct option *opt = find_option(opts, shortopt, longopt); - - if (opt) - opt->flags |= flag; - return; -} - -void set_option_nobuild(struct option *opts, int shortopt, - const char *longopt, - const char *build_opt, - bool can_skip) -{ - struct option *opt = find_option(opts, shortopt, longopt); - - if (!opt) - return; - - opt->flags |= PARSE_OPT_NOBUILD; - opt->flags |= can_skip ? 
PARSE_OPT_CANSKIP : 0; - opt->build_opt = build_opt; -} diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h deleted file mode 100644 index dec893f..0000000 --- a/tools/perf/util/parse-options.h +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef __PERF_PARSE_OPTIONS_H -#define __PERF_PARSE_OPTIONS_H - -#include -#include - -enum parse_opt_type { - /* special types */ - OPTION_END, - OPTION_ARGUMENT, - OPTION_GROUP, - /* options with no arguments */ - OPTION_BIT, - OPTION_BOOLEAN, - OPTION_INCR, - OPTION_SET_UINT, - OPTION_SET_PTR, - /* options with arguments (usually) */ - OPTION_STRING, - OPTION_INTEGER, - OPTION_LONG, - OPTION_CALLBACK, - OPTION_U64, - OPTION_UINTEGER, -}; - -enum parse_opt_flags { - PARSE_OPT_KEEP_DASHDASH = 1, - PARSE_OPT_STOP_AT_NON_OPTION = 2, - PARSE_OPT_KEEP_ARGV0 = 4, - PARSE_OPT_KEEP_UNKNOWN = 8, - PARSE_OPT_NO_INTERNAL_HELP = 16, -}; - -enum parse_opt_option_flags { - PARSE_OPT_OPTARG = 1, - PARSE_OPT_NOARG = 2, - PARSE_OPT_NONEG = 4, - PARSE_OPT_HIDDEN = 8, - PARSE_OPT_LASTARG_DEFAULT = 16, - PARSE_OPT_DISABLED = 32, - PARSE_OPT_EXCLUSIVE = 64, - PARSE_OPT_NOEMPTY = 128, - PARSE_OPT_NOBUILD = 256, - PARSE_OPT_CANSKIP = 512, -}; - -struct option; -typedef int parse_opt_cb(const struct option *, const char *arg, int unset); - -/* - * `type`:: - * holds the type of the option, you must have an OPTION_END last in your - * array. - * - * `short_name`:: - * the character to use as a short option name, '\0' if none. - * - * `long_name`:: - * the long option name, without the leading dashes, NULL if none. - * - * `value`:: - * stores pointers to the values to be filled. - * - * `argh`:: - * token to explain the kind of argument this option wants. Keep it - * homogenous across the repository. - * - * `help`:: - * the short help associated to what the option does. - * Must never be NULL (except for OPTION_END). - * OPTION_GROUP uses this pointer to store the group header. - * - * `flags`:: - * mask of parse_opt_option_flags. - * PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs) - * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs - * PARSE_OPT_NONEG: says that this option cannot be negated - * PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in - * the long one. - * - * `callback`:: - * pointer to the callback to use for OPTION_CALLBACK. - * - * `defval`:: - * default value to fill (*->value) with for PARSE_OPT_OPTARG. - * OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in - * the value when met. - * CALLBACKS can use it like they want. 
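
One concrete reading of the defval/PARSE_OPT_OPTARG interplay documented above, using the OPT_STRING_OPTARG macro defined just below (the --input option is illustrative, not from the patch):

	/*
	 * OPT_STRING_OPTARG('i', "input", &file, "file", "input file", "-")
	 *
	 *   (option absent)    *file is left untouched
	 *   --input            argument omitted: *file = "-" (the defval)
	 *   --input=perf.data  *file = "perf.data"
	 */
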
- * - * `set`:: - * whether an option was set by the user - */ -struct option { - enum parse_opt_type type; - int short_name; - const char *long_name; - void *value; - const char *argh; - const char *help; - const char *build_opt; - - int flags; - parse_opt_cb *callback; - intptr_t defval; - bool *set; - void *data; -}; - -#define check_vtype(v, type) ( BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v ) - -#define OPT_END() { .type = OPTION_END } -#define OPT_ARGUMENT(l, h) { .type = OPTION_ARGUMENT, .long_name = (l), .help = (h) } -#define OPT_GROUP(h) { .type = OPTION_GROUP, .help = (h) } -#define OPT_BIT(s, l, v, h, b) { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h), .defval = (b) } -#define OPT_BOOLEAN(s, l, v, h) { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h) } -#define OPT_BOOLEAN_FLAG(s, l, v, h, f) { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h), .flags = (f) } -#define OPT_BOOLEAN_SET(s, l, v, os, h) \ - { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), \ - .value = check_vtype(v, bool *), .help = (h), \ - .set = check_vtype(os, bool *)} -#define OPT_INCR(s, l, v, h) { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } -#define OPT_SET_UINT(s, l, v, h, i) { .type = OPTION_SET_UINT, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h), .defval = (i) } -#define OPT_SET_PTR(s, l, v, h, p) { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) } -#define OPT_INTEGER(s, l, v, h) { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) } -#define OPT_UINTEGER(s, l, v, h) { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) } -#define OPT_LONG(s, l, v, h) { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) } -#define OPT_U64(s, l, v, h) { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) } -#define OPT_STRING(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) } -#define OPT_STRING_OPTARG(s, l, v, a, h, d) \ - { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ - .value = check_vtype(v, const char **), (a), .help = (h), \ - .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) } -#define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} -#define OPT_DATE(s, l, v, h) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb } -#define OPT_CALLBACK(s, l, v, a, h, f) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f) } -#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG } -#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback 
= (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT } -#define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\ - .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\ - .flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG} -#define OPT_CALLBACK_OPTARG(s, l, v, d, a, h, f) \ - { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), \ - .value = (v), (a), .help = (h), .callback = (f), \ - .flags = PARSE_OPT_OPTARG, .data = (d) } - -/* parse_options() will filter out the processed options and leave the - * non-option argments in argv[]. - * Returns the number of arguments left in argv[]. - * - * NOTE: parse_options() and parse_options_subcommand() may call exit() in the - * case of an error (or for 'special' options like --list-cmds or --list-opts). - */ -extern int parse_options(int argc, const char **argv, - const struct option *options, - const char * const usagestr[], int flags); - -extern int parse_options_subcommand(int argc, const char **argv, - const struct option *options, - const char *const subcommands[], - const char *usagestr[], int flags); - -extern NORETURN void usage_with_options(const char * const *usagestr, - const struct option *options); -extern NORETURN __attribute__((format(printf,3,4))) -void usage_with_options_msg(const char * const *usagestr, - const struct option *options, - const char *fmt, ...); - -/*----- incremantal advanced APIs -----*/ - -enum { - PARSE_OPT_HELP = -1, - PARSE_OPT_DONE, - PARSE_OPT_LIST_OPTS, - PARSE_OPT_LIST_SUBCMDS, - PARSE_OPT_UNKNOWN, -}; - -/* - * It's okay for the caller to consume argv/argc in the usual way. - * Other fields of that structure are private to parse-options and should not - * be modified in any way. 
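
The exported half of this API is typically driven as below; a hedged sketch modeled on how perf builtins dispatch subcommands, assuming the declarations above are in scope (cmd_record()/cmd_report() are hypothetical stand-ins for real handlers):

	#include <string.h>

	int cmd_record(int argc, const char **argv);	/* hypothetical */
	int cmd_report(int argc, const char **argv);	/* hypothetical */

	int cmd_sched(int argc, const char **argv)
	{
		const char *sched_usage[] = { NULL, NULL };	/* [0] built below */
		const char * const sched_subcommands[] = { "record", "report", NULL };
		struct option sched_options[] = { OPT_END() };

		/* fills sched_usage[0] with "... {record|report}" and stops
		 * at the first non-option so the subcommand survives */
		argc = parse_options_subcommand(argc, argv, sched_options,
						sched_subcommands, sched_usage,
						PARSE_OPT_STOP_AT_NON_OPTION);
		if (argc && !strncmp(argv[0], "rec", 3))
			return cmd_record(argc, argv);
		if (argc && !strncmp(argv[0], "rep", 3))
			return cmd_report(argc, argv);

		usage_with_options(sched_usage, sched_options);	/* exits */
	}
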
- */ -struct parse_opt_ctx_t { - const char **argv; - const char **out; - int argc, cpidx; - const char *opt; - const struct option *excl_opt; - int flags; -}; - -extern int parse_options_usage(const char * const *usagestr, - const struct option *opts, - const char *optstr, - bool short_opt); - - -/*----- some often used options -----*/ -extern int parse_opt_abbrev_cb(const struct option *, const char *, int); -extern int parse_opt_approxidate_cb(const struct option *, const char *, int); -extern int parse_opt_verbosity_cb(const struct option *, const char *, int); - -#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose") -#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet") -#define OPT__VERBOSITY(var) \ - { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \ - PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \ - { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \ - PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 } -#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run") -#define OPT__ABBREV(var) \ - { OPTION_CALLBACK, 0, "abbrev", (var), "n", \ - "use digits to display SHA-1s", \ - PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 } - -extern const char *parse_options_fix_filename(const char *prefix, const char *file); - -void set_option_flag(struct option *opts, int sopt, const char *lopt, int flag); -void set_option_nobuild(struct option *opts, int shortopt, const char *longopt, - const char *build_opt, bool can_skip); -#endif /* __PERF_PARSE_OPTIONS_H */ diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c index 4f2c1c2..646ecf7 100644 --- a/tools/perf/util/parse-regs-options.c +++ b/tools/perf/util/parse-regs-options.c @@ -1,7 +1,7 @@ #include "perf.h" #include "util/util.h" #include "util/debug.h" -#include "util/parse-options.h" +#include #include "util/parse-regs-options.h" int diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c deleted file mode 100644 index fed37d6..0000000 --- a/tools/perf/util/run-command.c +++ /dev/null @@ -1,227 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "subcmd-util.h" -#include "run-command.h" -#include "exec_cmd.h" - -#define STRERR_BUFSIZE 128 - -static inline void close_pair(int fd[2]) -{ - close(fd[0]); - close(fd[1]); -} - -static inline void dup_devnull(int to) -{ - int fd = open("/dev/null", O_RDWR); - dup2(fd, to); - close(fd); -} - -int start_command(struct child_process *cmd) -{ - int need_in, need_out, need_err; - int fdin[2], fdout[2], fderr[2]; - char sbuf[STRERR_BUFSIZE]; - - /* - * In case of errors we must keep the promise to close FDs - * that have been passed in via ->in and ->out. 
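
An illustrative caller of this FD machinery, not part of the patch: the .out = -1 convention asks start_command() to allocate a pipe and hand the parent the readable end (per the run-command.h comments shown further down); "perf-archive" is a stand-in child command:

	#include <unistd.h>

	static int drain_child_stdout(void)
	{
		const char *child_argv[] = { "perf-archive", NULL };
		struct child_process child = {
			.argv     = child_argv,
			.no_stdin = 1,		/* child gets /dev/null on fd 0 */
			.out      = -1,		/* parent reads the child's stdout */
		};
		char buf[256];

		if (start_command(&child))
			return -1;
		while (read(child.out, buf, sizeof(buf)) > 0)
			;	/* consume output */
		close(child.out);
		return finish_command(&child);	/* 0, or negated error/exit code */
	}
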
- */ - - need_in = !cmd->no_stdin && cmd->in < 0; - if (need_in) { - if (pipe(fdin) < 0) { - if (cmd->out > 0) - close(cmd->out); - return -ERR_RUN_COMMAND_PIPE; - } - cmd->in = fdin[1]; - } - - need_out = !cmd->no_stdout - && !cmd->stdout_to_stderr - && cmd->out < 0; - if (need_out) { - if (pipe(fdout) < 0) { - if (need_in) - close_pair(fdin); - else if (cmd->in) - close(cmd->in); - return -ERR_RUN_COMMAND_PIPE; - } - cmd->out = fdout[0]; - } - - need_err = !cmd->no_stderr && cmd->err < 0; - if (need_err) { - if (pipe(fderr) < 0) { - if (need_in) - close_pair(fdin); - else if (cmd->in) - close(cmd->in); - if (need_out) - close_pair(fdout); - else if (cmd->out) - close(cmd->out); - return -ERR_RUN_COMMAND_PIPE; - } - cmd->err = fderr[0]; - } - - fflush(NULL); - cmd->pid = fork(); - if (!cmd->pid) { - if (cmd->no_stdin) - dup_devnull(0); - else if (need_in) { - dup2(fdin[0], 0); - close_pair(fdin); - } else if (cmd->in) { - dup2(cmd->in, 0); - close(cmd->in); - } - - if (cmd->no_stderr) - dup_devnull(2); - else if (need_err) { - dup2(fderr[1], 2); - close_pair(fderr); - } - - if (cmd->no_stdout) - dup_devnull(1); - else if (cmd->stdout_to_stderr) - dup2(2, 1); - else if (need_out) { - dup2(fdout[1], 1); - close_pair(fdout); - } else if (cmd->out > 1) { - dup2(cmd->out, 1); - close(cmd->out); - } - - if (cmd->dir && chdir(cmd->dir)) - die("exec %s: cd to %s failed (%s)", cmd->argv[0], - cmd->dir, strerror_r(errno, sbuf, sizeof(sbuf))); - if (cmd->env) { - for (; *cmd->env; cmd->env++) { - if (strchr(*cmd->env, '=')) - putenv((char*)*cmd->env); - else - unsetenv(*cmd->env); - } - } - if (cmd->preexec_cb) - cmd->preexec_cb(); - if (cmd->exec_cmd) { - execv_cmd(cmd->argv); - } else { - execvp(cmd->argv[0], (char *const*) cmd->argv); - } - exit(127); - } - - if (cmd->pid < 0) { - int err = errno; - if (need_in) - close_pair(fdin); - else if (cmd->in) - close(cmd->in); - if (need_out) - close_pair(fdout); - else if (cmd->out) - close(cmd->out); - if (need_err) - close_pair(fderr); - return err == ENOENT ? - -ERR_RUN_COMMAND_EXEC : - -ERR_RUN_COMMAND_FORK; - } - - if (need_in) - close(fdin[0]); - else if (cmd->in) - close(cmd->in); - - if (need_out) - close(fdout[1]); - else if (cmd->out) - close(cmd->out); - - if (need_err) - close(fderr[1]); - - return 0; -} - -static int wait_or_whine(pid_t pid) -{ - char sbuf[STRERR_BUFSIZE]; - - for (;;) { - int status, code; - pid_t waiting = waitpid(pid, &status, 0); - - if (waiting < 0) { - if (errno == EINTR) - continue; - fprintf(stderr, " Error: waitpid failed (%s)", - strerror_r(errno, sbuf, sizeof(sbuf))); - return -ERR_RUN_COMMAND_WAITPID; - } - if (waiting != pid) - return -ERR_RUN_COMMAND_WAITPID_WRONG_PID; - if (WIFSIGNALED(status)) - return -ERR_RUN_COMMAND_WAITPID_SIGNAL; - - if (!WIFEXITED(status)) - return -ERR_RUN_COMMAND_WAITPID_NOEXIT; - code = WEXITSTATUS(status); - switch (code) { - case 127: - return -ERR_RUN_COMMAND_EXEC; - case 0: - return 0; - default: - return -code; - } - } -} - -int finish_command(struct child_process *cmd) -{ - return wait_or_whine(cmd->pid); -} - -int run_command(struct child_process *cmd) -{ - int code = start_command(cmd); - if (code) - return code; - return finish_command(cmd); -} - -static void prepare_run_command_v_opt(struct child_process *cmd, - const char **argv, - int opt) -{ - memset(cmd, 0, sizeof(*cmd)); - cmd->argv = argv; - cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0; - cmd->exec_cmd = opt & RUN_EXEC_CMD ? 1 : 0; - cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 
1 : 0; -} - -int run_command_v_opt(const char **argv, int opt) -{ - struct child_process cmd; - prepare_run_command_v_opt(&cmd, argv, opt); - return run_command(&cmd); -} diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h deleted file mode 100644 index 4a55393..0000000 --- a/tools/perf/util/run-command.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef __PERF_RUN_COMMAND_H -#define __PERF_RUN_COMMAND_H - -#include - -enum { - ERR_RUN_COMMAND_FORK = 10000, - ERR_RUN_COMMAND_EXEC, - ERR_RUN_COMMAND_PIPE, - ERR_RUN_COMMAND_WAITPID, - ERR_RUN_COMMAND_WAITPID_WRONG_PID, - ERR_RUN_COMMAND_WAITPID_SIGNAL, - ERR_RUN_COMMAND_WAITPID_NOEXIT, -}; -#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK) - -struct child_process { - const char **argv; - pid_t pid; - /* - * Using .in, .out, .err: - * - Specify 0 for no redirections (child inherits stdin, stdout, - * stderr from parent). - * - Specify -1 to have a pipe allocated as follows: - * .in: returns the writable pipe end; parent writes to it, - * the readable pipe end becomes child's stdin - * .out, .err: returns the readable pipe end; parent reads from - * it, the writable pipe end becomes child's stdout/stderr - * The caller of start_command() must close the returned FDs - * after it has completed reading from/writing to it! - * - Specify > 0 to set a channel to a particular FD as follows: - * .in: a readable FD, becomes child's stdin - * .out: a writable FD, becomes child's stdout/stderr - * .err > 0 not supported - * The specified FD is closed by start_command(), even in case - * of errors! - */ - int in; - int out; - int err; - const char *dir; - const char *const *env; - unsigned no_stdin:1; - unsigned no_stdout:1; - unsigned no_stderr:1; - unsigned exec_cmd:1; /* if this is to be external sub-command */ - unsigned stdout_to_stderr:1; - void (*preexec_cb)(void); -}; - -int start_command(struct child_process *); -int finish_command(struct child_process *); -int run_command(struct child_process *); - -#define RUN_COMMAND_NO_STDIN 1 -#define RUN_EXEC_CMD 2 /*If this is to be external sub-command */ -#define RUN_COMMAND_STDOUT_TO_STDERR 4 -int run_command_v_opt(const char **argv, int opt); - -#endif /* __PERF_RUN_COMMAND_H */ diff --git a/tools/perf/util/sigchain.c b/tools/perf/util/sigchain.c deleted file mode 100644 index 3537c34..0000000 --- a/tools/perf/util/sigchain.c +++ /dev/null @@ -1,53 +0,0 @@ -#include -#include "subcmd-util.h" -#include "sigchain.h" - -#define SIGCHAIN_MAX_SIGNALS 32 - -struct sigchain_signal { - sigchain_fun *old; - int n; - int alloc; -}; -static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS]; - -static void check_signum(int sig) -{ - if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS) - die("BUG: signal out of range: %d", sig); -} - -static int sigchain_push(int sig, sigchain_fun f) -{ - struct sigchain_signal *s = signals + sig; - check_signum(sig); - - ALLOC_GROW(s->old, s->n + 1, s->alloc); - s->old[s->n] = signal(sig, f); - if (s->old[s->n] == SIG_ERR) - return -1; - s->n++; - return 0; -} - -int sigchain_pop(int sig) -{ - struct sigchain_signal *s = signals + sig; - check_signum(sig); - if (s->n < 1) - return 0; - - if (signal(sig, s->old[s->n - 1]) == SIG_ERR) - return -1; - s->n--; - return 0; -} - -void sigchain_push_common(sigchain_fun f) -{ - sigchain_push(SIGINT, f); - sigchain_push(SIGHUP, f); - sigchain_push(SIGTERM, f); - sigchain_push(SIGQUIT, f); - sigchain_push(SIGPIPE, f); -} diff --git a/tools/perf/util/sigchain.h b/tools/perf/util/sigchain.h deleted file mode 100644 index 
959d64e..0000000 --- a/tools/perf/util/sigchain.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __PERF_SIGCHAIN_H -#define __PERF_SIGCHAIN_H - -typedef void (*sigchain_fun)(int); - -int sigchain_pop(int sig); - -void sigchain_push_common(sigchain_fun f); - -#endif /* __PERF_SIGCHAIN_H */ diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 3122885..86f05e7 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -18,7 +18,7 @@ #include "debug.h" #include "header.h" -#include "parse-options.h" +#include #include "parse-events.h" #include "hist.h" #include "thread.h" diff --git a/tools/perf/util/subcmd-config.c b/tools/perf/util/subcmd-config.c deleted file mode 100644 index d017c72..0000000 --- a/tools/perf/util/subcmd-config.c +++ /dev/null @@ -1,11 +0,0 @@ -#include "subcmd-config.h" - -#define UNDEFINED "SUBCMD_HAS_NOT_BEEN_INITIALIZED" - -struct subcmd_config subcmd_config = { - .exec_name = UNDEFINED, - .prefix = UNDEFINED, - .exec_path = UNDEFINED, - .exec_path_env = UNDEFINED, - .pager_env = UNDEFINED, -}; diff --git a/tools/perf/util/subcmd-config.h b/tools/perf/util/subcmd-config.h deleted file mode 100644 index cc85140..0000000 --- a/tools/perf/util/subcmd-config.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __PERF_SUBCMD_CONFIG_H -#define __PERF_SUBCMD_CONFIG_H - -struct subcmd_config { - const char *exec_name; - const char *prefix; - const char *exec_path; - const char *exec_path_env; - const char *pager_env; -}; - -extern struct subcmd_config subcmd_config; - -#endif /* __PERF_SUBCMD_CONFIG_H */ diff --git a/tools/perf/util/subcmd-util.h b/tools/perf/util/subcmd-util.h deleted file mode 100644 index 321aeb1..0000000 --- a/tools/perf/util/subcmd-util.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __PERF_SUBCMD_UTIL_H -#define __PERF_SUBCMD_UTIL_H - -#include -#include -#include - -#define NORETURN __attribute__((__noreturn__)) - -static inline void report(const char *prefix, const char *err, va_list params) -{ - char msg[1024]; - vsnprintf(msg, sizeof(msg), err, params); - fprintf(stderr, " %s%s\n", prefix, msg); -} - -static NORETURN inline void die(const char *err, ...) -{ - va_list params; - - va_start(params, err); - report(" Fatal: ", err, params); - exit(128); - va_end(params); -} - -#define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) - -#define alloc_nr(x) (((x)+16)*3/2) - -/* - * Realloc the buffer pointed at by variable 'x' so that it can hold - * at least 'nr' entries; the number of entries currently allocated - * is 'alloc', using the standard growing factor alloc_nr() macro. - * - * DO NOT USE any expression with side-effect for 'x' or 'alloc'. - */ -#define ALLOC_GROW(x, nr, alloc) \ - do { \ - if ((nr) > alloc) { \ - if (alloc_nr(alloc) < (nr)) \ - alloc = (nr); \ - else \ - alloc = alloc_nr(alloc); \ - x = xrealloc((x), alloc * sizeof(*(x))); \ - } \ - } while(0) - -static inline void *xrealloc(void *ptr, size_t size) -{ - void *ret = realloc(ptr, size); - if (!ret && !size) - ret = realloc(ptr, 1); - if (!ret) { - ret = realloc(ptr, size); - if (!ret && !size) - ret = realloc(ptr, 1); - if (!ret) - die("Out of memory, realloc failed"); - } - return ret; -} - -#define astrcatf(out, fmt, ...) 
\ -({ \ - char *tmp = *(out); \ - if (asprintf((out), "%s" fmt, tmp ?: "", ## __VA_ARGS__) == -1) \ - die("asprintf failed"); \ - free(tmp); \ -}) - -static inline void astrcat(char **out, const char *add) -{ - char *tmp = *out; - - if (asprintf(out, "%s%s", tmp ?: "", add) == -1) - die("asprintf failed"); - - free(tmp); -} - -static inline int prefixcmp(const char *str, const char *prefix) -{ - for (; ; str++, prefix++) - if (!*prefix) - return 0; - else if (*str != *prefix) - return (unsigned char)*prefix - (unsigned char)*str; -} - -#endif /* __PERF_SUBCMD_UTIL_H */ -- cgit v1.1 From 1843b4e057b7717db21a3ad96fa16d6b4ee8f6c4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 15 Dec 2015 09:39:40 -0600 Subject: tools subcmd: Rename subcmd header include guards Signed-off-by: Josh Poimboeuf Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/d8081e7528b25ad91f4154b6a3fd063e93c108ec.1450193761.git.jpoimboe@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/exec-cmd.h | 6 +++--- tools/lib/subcmd/help.h | 6 +++--- tools/lib/subcmd/pager.h | 6 +++--- tools/lib/subcmd/parse-options.h | 7 ++++--- tools/lib/subcmd/run-command.h | 6 +++--- tools/lib/subcmd/sigchain.h | 6 +++--- tools/lib/subcmd/subcmd-util.h | 6 +++--- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/tools/lib/subcmd/exec-cmd.h b/tools/lib/subcmd/exec-cmd.h index f1bd343..5d08bda 100644 --- a/tools/lib/subcmd/exec-cmd.h +++ b/tools/lib/subcmd/exec-cmd.h @@ -1,5 +1,5 @@ -#ifndef __PERF_EXEC_CMD_H -#define __PERF_EXEC_CMD_H +#ifndef __SUBCMD_EXEC_CMD_H +#define __SUBCMD_EXEC_CMD_H extern void exec_cmd_init(const char *exec_name, const char *prefix, const char *exec_path, const char *exec_path_env); @@ -13,4 +13,4 @@ extern int execl_cmd(const char *cmd, ...); extern char *get_argv_exec_path(void); extern char *system_path(const char *path); -#endif /* __PERF_EXEC_CMD_H */ +#endif /* __SUBCMD_EXEC_CMD_H */ diff --git a/tools/lib/subcmd/help.h b/tools/lib/subcmd/help.h index 096c8bc..e145a02 100644 --- a/tools/lib/subcmd/help.h +++ b/tools/lib/subcmd/help.h @@ -1,5 +1,5 @@ -#ifndef __PERF_HELP_H -#define __PERF_HELP_H +#ifndef __SUBCMD_HELP_H +#define __SUBCMD_HELP_H #include @@ -31,4 +31,4 @@ int is_in_cmdlist(struct cmdnames *c, const char *s); void list_commands(const char *title, struct cmdnames *main_cmds, struct cmdnames *other_cmds); -#endif /* __PERF_HELP_H */ +#endif /* __SUBCMD_HELP_H */ diff --git a/tools/lib/subcmd/pager.h b/tools/lib/subcmd/pager.h index d6a591a..8b83714 100644 --- a/tools/lib/subcmd/pager.h +++ b/tools/lib/subcmd/pager.h @@ -1,9 +1,9 @@ -#ifndef __PERF_PAGER_H -#define __PERF_PAGER_H +#ifndef __SUBCMD_PAGER_H +#define __SUBCMD_PAGER_H extern void pager_init(const char *pager_env); extern void setup_pager(void); extern int pager_in_use(void); -#endif /* __PERF_PAGER_H */ +#endif /* __SUBCMD_PAGER_H */ diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index dec893f..13a2cc1 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -1,5 +1,5 @@ -#ifndef __PERF_PARSE_OPTIONS_H -#define __PERF_PARSE_OPTIONS_H +#ifndef __SUBCMD_PARSE_OPTIONS_H +#define __SUBCMD_PARSE_OPTIONS_H #include #include @@ -225,4 +225,5 @@ extern const char *parse_options_fix_filename(const char *prefix, const char *fi void set_option_flag(struct option *opts, int sopt, const char *lopt, int flag); void set_option_nobuild(struct option *opts, int shortopt, const char *longopt, const char *build_opt, bool 
can_skip); -#endif /* __PERF_PARSE_OPTIONS_H */ + +#endif /* __SUBCMD_PARSE_OPTIONS_H */ diff --git a/tools/lib/subcmd/run-command.h b/tools/lib/subcmd/run-command.h index 4a55393..fe2befe 100644 --- a/tools/lib/subcmd/run-command.h +++ b/tools/lib/subcmd/run-command.h @@ -1,5 +1,5 @@ -#ifndef __PERF_RUN_COMMAND_H -#define __PERF_RUN_COMMAND_H +#ifndef __SUBCMD_RUN_COMMAND_H +#define __SUBCMD_RUN_COMMAND_H #include @@ -57,4 +57,4 @@ int run_command(struct child_process *); #define RUN_COMMAND_STDOUT_TO_STDERR 4 int run_command_v_opt(const char **argv, int opt); -#endif /* __PERF_RUN_COMMAND_H */ +#endif /* __SUBCMD_RUN_COMMAND_H */ diff --git a/tools/lib/subcmd/sigchain.h b/tools/lib/subcmd/sigchain.h index 959d64e..0c919f2 100644 --- a/tools/lib/subcmd/sigchain.h +++ b/tools/lib/subcmd/sigchain.h @@ -1,5 +1,5 @@ -#ifndef __PERF_SIGCHAIN_H -#define __PERF_SIGCHAIN_H +#ifndef __SUBCMD_SIGCHAIN_H +#define __SUBCMD_SIGCHAIN_H typedef void (*sigchain_fun)(int); @@ -7,4 +7,4 @@ int sigchain_pop(int sig); void sigchain_push_common(sigchain_fun f); -#endif /* __PERF_SIGCHAIN_H */ +#endif /* __SUBCMD_SIGCHAIN_H */ diff --git a/tools/lib/subcmd/subcmd-util.h b/tools/lib/subcmd/subcmd-util.h index 321aeb1..fc2e45d 100644 --- a/tools/lib/subcmd/subcmd-util.h +++ b/tools/lib/subcmd/subcmd-util.h @@ -1,5 +1,5 @@ -#ifndef __PERF_SUBCMD_UTIL_H -#define __PERF_SUBCMD_UTIL_H +#ifndef __SUBCMD_UTIL_H +#define __SUBCMD_UTIL_H #include #include @@ -88,4 +88,4 @@ static inline int prefixcmp(const char *str, const char *prefix) return (unsigned char)*prefix - (unsigned char)*str; } -#endif /* __PERF_SUBCMD_UTIL_H */ +#endif /* __SUBCMD_UTIL_H */ -- cgit v1.1 From 5f3339d2e83ca587c2e13c3e37e1b5fb7c68ebe5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:19 +0100 Subject: perf thread_map: Add thread_map user level event Adding the thread_map event to pass/store thread maps as data in the pipe/perf.data. Storing the thread ID along with the standard comm[16] thread name string. 
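To make the record layout concrete, here is a minimal standalone sketch of the variable-length event body. The struct shapes mirror the ones added below, but they are local stand-ins rather than the perf headers, and the perf_event_header prefix is omitted:

#include <stdio.h>
#include <stdint.h>

/* Local stand-ins for the structures added in this patch (sketch only). */
struct thread_map_event_entry {
	uint64_t pid;
	char comm[16];
};

struct thread_map_event {
	uint64_t nr;
	struct thread_map_event_entry entries[];
};

int main(void)
{
	/* The on-the-wire payload is the fixed part plus nr entries. */
	uint64_t nr = 4;
	size_t size = sizeof(struct thread_map_event) +
		      nr * sizeof(struct thread_map_event_entry);

	printf("payload for %llu threads: %zu bytes\n",
	       (unsigned long long)nr, size);
	return 0;
}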
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-4-git-send-email-jolsa@kernel.org [ Renamed thread_map_data_event to thread_map_event_entry ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 1 + tools/perf/util/event.h | 13 +++++++++++++ tools/perf/util/session.c | 26 ++++++++++++++++++++++++++ tools/perf/util/tool.h | 3 ++- 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 8b10621..771545a 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -37,6 +37,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_AUXTRACE_INFO] = "AUXTRACE_INFO", [PERF_RECORD_AUXTRACE] = "AUXTRACE", [PERF_RECORD_AUXTRACE_ERROR] = "AUXTRACE_ERROR", + [PERF_RECORD_THREAD_MAP] = "THREAD_MAP", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index a0dbcbd..66f303e 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -226,6 +226,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_AUXTRACE_INFO = 70, PERF_RECORD_AUXTRACE = 71, PERF_RECORD_AUXTRACE_ERROR = 72, + PERF_RECORD_THREAD_MAP = 73, PERF_RECORD_HEADER_MAX }; @@ -356,6 +357,17 @@ struct context_switch_event { u32 next_prev_tid; }; +struct thread_map_event_entry { + u64 pid; + char comm[16]; +}; + +struct thread_map_event { + struct perf_event_header header; + u64 nr; + struct thread_map_event_entry entries[]; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -378,6 +390,7 @@ union perf_event { struct aux_event aux; struct itrace_start_event itrace_start; struct context_switch_event context_switch; + struct thread_map_event thread_map; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 9774686..36b07b2 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -296,6 +296,16 @@ int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused, return 0; } + +static +int process_event_thread_map_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_session *session __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) @@ -346,6 +356,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->auxtrace = process_event_auxtrace_stub; if (tool->auxtrace_error == NULL) tool->auxtrace_error = process_event_auxtrace_error_stub; + if (tool->thread_map == NULL) + tool->thread_map = process_event_thread_map_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -616,6 +628,17 @@ static void perf_event__auxtrace_error_swap(union perf_event *event, event->auxtrace_error.ip = bswap_64(event->auxtrace_error.ip); } +static void perf_event__thread_map_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + unsigned i; + + event->thread_map.nr = bswap_64(event->thread_map.nr); + + for (i = 0; i < event->thread_map.nr; i++) + event->thread_map.entries[i].pid = bswap_64(event->thread_map.entries[i].pid); +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -643,6 +666,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_AUXTRACE_INFO] = perf_event__auxtrace_info_swap, [PERF_RECORD_AUXTRACE] = 
perf_event__auxtrace_swap, [PERF_RECORD_AUXTRACE_ERROR] = perf_event__auxtrace_error_swap, + [PERF_RECORD_THREAD_MAP] = perf_event__thread_map_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1179,6 +1203,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, case PERF_RECORD_AUXTRACE_ERROR: perf_session__auxtrace_error_inc(session, event); return tool->auxtrace_error(tool, event, session); + case PERF_RECORD_THREAD_MAP: + return tool->thread_map(tool, event, session); default: return -EINVAL; } diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index cab8cc2..1af4774 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -55,7 +55,8 @@ struct perf_tool { event_op2 build_id, id_index, auxtrace_info, - auxtrace_error; + auxtrace_error, + thread_map; event_op3 auxtrace; bool ordered_events; bool ordering_requires_timestamps; -- cgit v1.1 From 99471c967a00c875bb5d61f377d4267904545499 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:20 +0100 Subject: perf thread_map: Add thread_map event synthesize function Introduce the perf_event__synthesize_thread_map2 function to synthesize struct thread_map. The perf_event__synthesize_thread_map name is already taken for synthesizing the complete threads data (comm/mmap/fork). Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-5-git-send-email-jolsa@kernel.org [ Rename thread_map_data_event to thread_map_event_entry ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 4 ++++ tools/perf/tests/tests.h | 1 + tools/perf/tests/thread-map.c | 29 +++++++++++++++++++++++++++++ tools/perf/util/event.c | 36 ++++++++++++++++++++++++++++++++++++ tools/perf/util/event.h | 4 ++++ 5 files changed, 74 insertions(+) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 0372d59..745bdb0 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -180,6 +180,10 @@ static struct test generic_tests[] = { }, }, { + .desc = "Test thread map synthesize", + .func = test__thread_map_synthesize, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index a0733aa..3fe52cc 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -79,6 +79,7 @@ int test__bpf(int subtest); const char *test__bpf_subtest_get_desc(int subtest); int test__bpf_subtest_get_nr(void); int test_session_topology(int subtest); +int test__thread_map_synthesize(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index 2be02d3..ac5be25 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -40,3 +40,32 @@ int test__thread_map(int subtest __maybe_unused) thread_map__put(map); return 0; } + +static int process_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct thread_map_event *map = &event->thread_map; + + TEST_ASSERT_VAL("wrong nr", map->nr == 1); + TEST_ASSERT_VAL("wrong pid", map->entries[0].pid == (u64) getpid()); + TEST_ASSERT_VAL("wrong comm", !strcmp(map->entries[0].comm, "perf")); + return 0; +} + +int test__thread_map_synthesize(int subtest __maybe_unused) +{ + struct thread_map *threads; + + /* test map on current pid */ + threads =
thread_map__new_by_pid(getpid()); + TEST_ASSERT_VAL("failed to alloc map", threads); + + thread_map__read_comms(threads); + + TEST_ASSERT_VAL("failed to synthesize map", + !perf_event__synthesize_thread_map2(NULL, threads, process_event, NULL)); + + return 0; +} diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 771545a..b13373a 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -700,6 +700,42 @@ int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, return err; } +int perf_event__synthesize_thread_map2(struct perf_tool *tool, + struct thread_map *threads, + perf_event__handler_t process, + struct machine *machine) +{ + union perf_event *event; + int i, err, size; + + size = sizeof(event->thread_map); + size += threads->nr * sizeof(event->thread_map.entries[0]); + + event = zalloc(size); + if (!event) + return -ENOMEM; + + event->header.type = PERF_RECORD_THREAD_MAP; + event->header.size = size; + event->thread_map.nr = threads->nr; + + for (i = 0; i < threads->nr; i++) { + struct thread_map_event_entry *entry = &event->thread_map.entries[i]; + char *comm = thread_map__comm(threads, i); + + if (!comm) + comm = (char *) ""; + + entry->pid = thread_map__pid(threads, i); + strncpy((char *) &entry->comm, comm, sizeof(entry->comm)); + } + + err = process(tool, event, NULL, machine); + + free(event); + return err; +} + size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp) { const char *s; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 66f303e..952dd4d 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -408,6 +408,10 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, unsigned int proc_map_timeout); +int perf_event__synthesize_thread_map2(struct perf_tool *tool, + struct thread_map *threads, + perf_event__handler_t process, + struct machine *machine); int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, -- cgit v1.1 From 59660942397b57b37eccba014544623cf4beb12b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:21 +0100 Subject: perf thread_map: Add thread_map__new_event function Introducing the thread_map__new_event function to create a struct thread_map object from a thread_map event. 
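One subtlety in the synthesized entries: comm[16] is filled with strncpy() by the synthesize function above, so a name that uses all 16 bytes arrives without a NUL terminator. The reader added in this patch copes by duplicating at most 16 bytes with strndup(), which always appends its own terminator. A standalone sketch of that hazard and the fix:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char comm[16];

	/* Worst case: a 16-byte name leaves no room for the NUL. */
	memset(comm, 'x', sizeof(comm));

	/* strndup() copies at most 16 bytes and adds the terminator. */
	char *copy = strndup(comm, sizeof(comm));

	printf("len=%zu comm=%s\n", strlen(copy), copy);
	free(copy);
	return 0;
}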
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/thread-map.c | 14 ++++++++++++++ tools/perf/util/thread_map.c | 27 +++++++++++++++++++++++++++ tools/perf/util/thread_map.h | 3 +++ 3 files changed, 44 insertions(+) diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c index ac5be25..fccde84 100644 --- a/tools/perf/tests/thread-map.c +++ b/tools/perf/tests/thread-map.c @@ -47,10 +47,24 @@ static int process_event(struct perf_tool *tool __maybe_unused, struct machine *machine __maybe_unused) { struct thread_map_event *map = &event->thread_map; + struct thread_map *threads; TEST_ASSERT_VAL("wrong nr", map->nr == 1); TEST_ASSERT_VAL("wrong pid", map->entries[0].pid == (u64) getpid()); TEST_ASSERT_VAL("wrong comm", !strcmp(map->entries[0].comm, "perf")); + + threads = thread_map__new_event(&event->thread_map); + TEST_ASSERT_VAL("failed to alloc map", threads); + + TEST_ASSERT_VAL("wrong nr", threads->nr == 1); + TEST_ASSERT_VAL("wrong pid", + thread_map__pid(threads, 0) == getpid()); + TEST_ASSERT_VAL("wrong comm", + thread_map__comm(threads, 0) && + !strcmp(thread_map__comm(threads, 0), "perf")); + TEST_ASSERT_VAL("wrong refcnt", + atomic_read(&threads->refcnt) == 1); + thread_map__put(threads); return 0; } diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 371fb28..08afc690 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -13,6 +13,7 @@ #include "thread_map.h" #include "util.h" #include "debug.h" +#include "event.h" /* Skip "." and ".." directories */ static int filter(const struct dirent *dir) @@ -409,3 +410,29 @@ void thread_map__read_comms(struct thread_map *threads) for (i = 0; i < threads->nr; ++i) comm_init(threads, i); } + +static void thread_map__copy_event(struct thread_map *threads, + struct thread_map_event *event) +{ + unsigned i; + + threads->nr = (int) event->nr; + + for (i = 0; i < event->nr; i++) { + thread_map__set_pid(threads, i, (pid_t) event->entries[i].pid); + threads->map[i].comm = strndup(event->entries[i].comm, 16); + } + + atomic_set(&threads->refcnt, 1); +} + +struct thread_map *thread_map__new_event(struct thread_map_event *event) +{ + struct thread_map *threads; + + threads = thread_map__alloc(event->nr); + if (threads) + thread_map__copy_event(threads, event); + + return threads; +} diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h index af679d8..85e4c7c 100644 --- a/tools/perf/util/thread_map.h +++ b/tools/perf/util/thread_map.h @@ -16,11 +16,14 @@ struct thread_map { struct thread_map_data map[]; }; +struct thread_map_event; + struct thread_map *thread_map__new_dummy(void); struct thread_map *thread_map__new_by_pid(pid_t pid); struct thread_map *thread_map__new_by_tid(pid_t tid); struct thread_map *thread_map__new_by_uid(uid_t uid); struct thread_map *thread_map__new(pid_t pid, pid_t tid, uid_t uid); +struct thread_map *thread_map__new_event(struct thread_map_event *event); struct thread_map *thread_map__get(struct thread_map *map); void thread_map__put(struct thread_map *map); -- cgit v1.1 From ec7fa596f514b76a5f1003ffe9e6dfb50cb9e811 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:22 +0100 Subject: perf thread_map: Add perf_event__fprintf_thread_map function To display a thread_map event for a raw dump. 
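The dump helper follows the same pattern as the other perf_event__fprintf_*() routines: build the output piecewise and return the accumulated byte count so callers can keep a running total. A simplified standalone sketch of that shape (no event decoding, illustrative names):

#include <stdio.h>

static size_t fprintf_ids(FILE *fp, const int *ids, int nr)
{
	/* Accumulate the return values of each fprintf() call. */
	size_t ret = fprintf(fp, " nr: %d [", nr);

	for (int i = 0; i < nr; i++)
		ret += fprintf(fp, "%s%d", i ? ", " : "", ids[i]);

	ret += fprintf(fp, "]\n");
	return ret;
}

int main(void)
{
	int ids[] = { 1234, 1235 };

	printf("wrote %zu bytes\n", fprintf_ids(stdout, ids, 2));
	return 0;
}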
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 16 ++++++++++++++++ tools/perf/util/event.h | 1 + 2 files changed, 17 insertions(+) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index b13373a..938f006 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -820,6 +820,22 @@ size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp) event->mmap2.filename); } +size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp) +{ + struct thread_map *threads = thread_map__new_event(&event->thread_map); + size_t ret; + + ret = fprintf(fp, " nr: "); + + if (threads) + ret += thread_map__fprintf(threads, fp); + else + ret += fprintf(fp, "failed to get threads from event\n"); + + thread_map__put(threads); + return ret; +} + int perf_event__process_mmap(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 952dd4d..b7ad896 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -516,6 +516,7 @@ size_t perf_event__fprintf_task(union perf_event *event, FILE *fp); size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp); size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp); size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp); size_t perf_event__fprintf(union perf_event *event, FILE *fp); u64 kallsyms__get_function_start(const char *kallsyms_filename, -- cgit v1.1 From 6640b6c227fc85fd8bdcc4a31239a04450487f6a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:23 +0100 Subject: perf cpu_map: Add cpu_map user level event Adding the cpu_map event to pass/store cpu maps as data in a pipe/perf.data. We store maps in 2 formats: - list of cpus - mask of cpus The format that takes less space is selected transparently in the following patch. The interface is made generic, so we could add the cpumap event data into another event in the following patches. 
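The trade-off between the two formats is easy to quantify: a dense set of low-numbered CPUs packs well into a bitmask, while a sparse set with a high maximum CPU number is cheaper as an explicit list. A rough standalone comparison, using local stand-ins for the u16-based structs introduced below:

#include <stdio.h>

struct cpu_list { unsigned short nr; unsigned short cpu[]; };
struct cpu_mask { unsigned short nr, long_size; unsigned long mask[]; };

#define BITS_PER_LONG (8 * sizeof(long))

static size_t list_bytes(int nr_cpus)
{
	return sizeof(struct cpu_list) + nr_cpus * sizeof(unsigned short);
}

static size_t mask_bytes(int max_cpu)
{
	/* cpu N lands on bit N, so size for max_cpu + 1 bits. */
	size_t longs = (max_cpu + BITS_PER_LONG) / BITS_PER_LONG;

	return sizeof(struct cpu_mask) + longs * sizeof(long);
}

int main(void)
{
	/* Dense 0-19: the mask wins.  Sparse {1, 256}: the list wins. */
	printf("cpus 0-19:  list=%zu mask=%zu\n", list_bytes(20), mask_bytes(19));
	printf("cpus 1,256: list=%zu mask=%zu\n", list_bytes(2), mask_bytes(256));
	return 0;
}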
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-8-git-send-email-jolsa@kernel.org [ cpu_map_data_cpus -> cpu_map_entries, cpu_map_data_mask -> cpu_map_mask ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 1 + tools/perf/util/event.h | 28 ++++++++++++++++++++++++++ tools/perf/util/session.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/tool.h | 3 ++- 4 files changed, 81 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 938f006..719c078 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -38,6 +38,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_AUXTRACE] = "AUXTRACE", [PERF_RECORD_AUXTRACE_ERROR] = "AUXTRACE_ERROR", [PERF_RECORD_THREAD_MAP] = "THREAD_MAP", + [PERF_RECORD_CPU_MAP] = "CPU_MAP", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index b7ad896..1c82a0e 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -227,6 +227,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_AUXTRACE = 71, PERF_RECORD_AUXTRACE_ERROR = 72, PERF_RECORD_THREAD_MAP = 73, + PERF_RECORD_CPU_MAP = 74, PERF_RECORD_HEADER_MAX }; @@ -271,6 +272,32 @@ struct events_stats { u32 nr_proc_map_timeout; }; +enum { + PERF_CPU_MAP__CPUS = 0, + PERF_CPU_MAP__MASK = 1, +}; + +struct cpu_map_entries { + u16 nr; + u16 cpu[]; +}; + +struct cpu_map_mask { + u16 nr; + u16 long_size; + unsigned long mask[]; +}; + +struct cpu_map_data { + u16 type; + char data[]; +}; + +struct cpu_map_event { + struct perf_event_header header; + struct cpu_map_data data; +}; + struct attr_event { struct perf_event_header header; struct perf_event_attr attr; @@ -391,6 +418,7 @@ union perf_event { struct itrace_start_event itrace_start; struct context_switch_event context_switch; struct thread_map_event thread_map; + struct cpu_map_event cpu_map; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 36b07b2..4350f5e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -306,6 +306,15 @@ int process_event_thread_map_stub(struct perf_tool *tool __maybe_unused, return 0; } +static +int process_event_cpu_map_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_session *session __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) @@ -358,6 +367,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->auxtrace_error = process_event_auxtrace_error_stub; if (tool->thread_map == NULL) tool->thread_map = process_event_thread_map_stub; + if (tool->cpu_map == NULL) + tool->cpu_map = process_event_cpu_map_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -639,6 +650,42 @@ static void perf_event__thread_map_swap(union perf_event *event, event->thread_map.entries[i].pid = bswap_64(event->thread_map.entries[i].pid); } +static void perf_event__cpu_map_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + struct cpu_map_data *data = &event->cpu_map.data; + struct cpu_map_entries *cpus; + struct cpu_map_mask *mask; + unsigned i; + + data->type = bswap_64(data->type); + + switch (data->type) { + case PERF_CPU_MAP__CPUS: + cpus = (struct cpu_map_entries *)data->data; + + 
cpus->nr = bswap_16(cpus->nr); + + for (i = 0; i < cpus->nr; i++) + cpus->cpu[i] = bswap_16(cpus->cpu[i]); + break; + case PERF_CPU_MAP__MASK: + mask = (struct cpu_map_mask *) data->data; + + mask->nr = bswap_16(mask->nr); + mask->long_size = bswap_16(mask->long_size); + + switch (mask->long_size) { + case 4: mem_bswap_32(&mask->mask, mask->nr); break; + case 8: mem_bswap_64(&mask->mask, mask->nr); break; + default: + pr_err("cpu_map swap: unsupported long size\n"); + } + default: + break; + } +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -667,6 +714,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_AUXTRACE] = perf_event__auxtrace_swap, [PERF_RECORD_AUXTRACE_ERROR] = perf_event__auxtrace_error_swap, [PERF_RECORD_THREAD_MAP] = perf_event__thread_map_swap, + [PERF_RECORD_CPU_MAP] = perf_event__cpu_map_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1205,6 +1253,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->auxtrace_error(tool, event, session); case PERF_RECORD_THREAD_MAP: return tool->thread_map(tool, event, session); + case PERF_RECORD_CPU_MAP: + return tool->cpu_map(tool, event, session); default: return -EINVAL; } diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 1af4774..9e5925c 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -56,7 +56,8 @@ struct perf_tool { id_index, auxtrace_info, auxtrace_error, - thread_map; + thread_map, + cpu_map; event_op3 auxtrace; bool ordered_events; bool ordering_requires_timestamps; -- cgit v1.1 From 6c872901af07c41745f1abf5ceac9b3b4d9cdbb6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:24 +0100 Subject: perf cpu_map: Add cpu_map event synthesize function Introduce the perf_event__synthesize_cpu_map function to synthesize a struct cpu_map. Added generic interface: cpu_map_data__alloc cpu_map_data__synthesize to make the cpu_map synthesizing usable for other events. 
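Splitting the interface into cpu_map_data__alloc() and cpu_map_data__synthesize() lets a caller size and type the payload first and then fill it in place, possibly embedded in a larger event. The mask fill itself is just per-CPU bit setting; a standalone sketch with a simplified stand-in for set_bit():

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(long))

/* Simplified stand-in for the kernel-style set_bit() used by
 * synthesize_mask() in the patch below. */
static void set_cpu_bit(unsigned long *mask, int cpu)
{
	mask[cpu / BITS_PER_LONG] |= 1UL << (cpu % BITS_PER_LONG);
}

int main(void)
{
	int cpus[] = { 0, 1, 2, 3, 19 };
	unsigned long mask[1] = { 0 };

	for (size_t i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
		set_cpu_bit(mask, cpus[i]);

	printf("mask[0] = %#lx\n", mask[0]);	/* 0x8000f: bits 0-3 and 19 */
	return 0;
}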
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/Build | 1 + tools/perf/tests/builtin-test.c | 4 ++ tools/perf/tests/cpumap.c | 71 ++++++++++++++++++++++ tools/perf/tests/tests.h | 1 + tools/perf/util/event.c | 131 ++++++++++++++++++++++++++++++++++++++++ tools/perf/util/event.h | 8 +++ 6 files changed, 216 insertions(+) create mode 100644 tools/perf/tests/cpumap.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index f23fb7e..7abad28 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -34,6 +34,7 @@ perf-y += thread-map.o perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o perf-y += bpf.o perf-y += topology.o +perf-y += cpumap.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 745bdb0..0c3fe28 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -184,6 +184,10 @@ static struct test generic_tests[] = { .func = test__thread_map_synthesize, }, { + .desc = "Test cpu map synthesize", + .func = test__cpu_map_synthesize, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c new file mode 100644 index 0000000..7154805 --- /dev/null +++ b/tools/perf/tests/cpumap.c @@ -0,0 +1,71 @@ +#include "tests.h" +#include "cpumap.h" + +static int process_event_mask(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct cpu_map_event *map = &event->cpu_map; + struct cpu_map_mask *mask; + struct cpu_map_data *data; + int i; + + data = &map->data; + + TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__MASK); + + mask = (struct cpu_map_mask *)data->data; + + TEST_ASSERT_VAL("wrong nr", mask->nr == 1); + + for (i = 0; i < 20; i++) { + TEST_ASSERT_VAL("wrong cpu", test_bit(i, mask->mask)); + } + + return 0; +} + +static int process_event_cpus(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct cpu_map_event *map = &event->cpu_map; + struct cpu_map_entries *cpus; + struct cpu_map_data *data; + + data = &map->data; + + TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__CPUS); + + cpus = (struct cpu_map_entries *)data->data; + + TEST_ASSERT_VAL("wrong nr", cpus->nr == 2); + TEST_ASSERT_VAL("wrong cpu", cpus->cpu[0] == 1); + TEST_ASSERT_VAL("wrong cpu", cpus->cpu[1] == 256); + return 0; +} + + +int test__cpu_map_synthesize(int subtest __maybe_unused) +{ + struct cpu_map *cpus; + + /* This one is better stored as a mask. */ + cpus = cpu_map__new("0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"); + + TEST_ASSERT_VAL("failed to synthesize map", + !perf_event__synthesize_cpu_map(NULL, cpus, process_event_mask, NULL)); + + cpu_map__put(cpus); + + /* This one is better stored as cpu values.
*/ + cpus = cpu_map__new("1,256"); + + TEST_ASSERT_VAL("failed to synthesize map", + !perf_event__synthesize_cpu_map(NULL, cpus, process_event_cpus, NULL)); + + cpu_map__put(cpus); + return 0; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 3fe52cc..f85160f 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -80,6 +80,7 @@ const char *test__bpf_subtest_get_desc(int subtest); int test__bpf_subtest_get_nr(void); int test_session_topology(int subtest); int test__thread_map_synthesize(int subtest); +int test__cpu_map_synthesize(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 719c078..15d6466 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -737,6 +737,137 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, return err; } +static void synthesize_cpus(struct cpu_map_entries *cpus, + struct cpu_map *map) +{ + int i; + + cpus->nr = map->nr; + + for (i = 0; i < map->nr; i++) + cpus->cpu[i] = map->map[i]; +} + +static void synthesize_mask(struct cpu_map_mask *mask, + struct cpu_map *map, int max) +{ + int i; + + mask->nr = BITS_TO_LONGS(max); + mask->long_size = sizeof(long); + + for (i = 0; i < map->nr; i++) + set_bit(map->map[i], mask->mask); +} + +static size_t cpus_size(struct cpu_map *map) +{ + return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16); +} + +static size_t mask_size(struct cpu_map *map, int *max) +{ + int i; + + *max = 0; + + for (i = 0; i < map->nr; i++) { + /* bit position of the cpu is + 1 */ + int bit = map->map[i] + 1; + + if (bit > *max) + *max = bit; + } + + return sizeof(struct cpu_map_mask) + BITS_TO_LONGS(*max) * sizeof(long); +} + +void *cpu_map_data__alloc(struct cpu_map *map, size_t *size, u16 *type, int *max) +{ + size_t size_cpus, size_mask; + bool is_dummy = cpu_map__empty(map); + + /* + * Both array and mask data have variable size based + * on the number of cpus and their actual values. + * The size of the 'struct cpu_map_data' is: + * + * array = size of 'struct cpu_map_entries' + + * number of cpus * sizeof(u64) + * + * mask = size of 'struct cpu_map_mask' + + * maximum cpu bit converted to size of longs + * + * and finally + the size of 'struct cpu_map_data'.
+ */ + size_cpus = cpus_size(map); + size_mask = mask_size(map, max); + + if (is_dummy || (size_cpus < size_mask)) { + *size += size_cpus; + *type = PERF_CPU_MAP__CPUS; + } else { + *size += size_mask; + *type = PERF_CPU_MAP__MASK; + } + + *size += sizeof(struct cpu_map_data); + return zalloc(*size); +} + +void cpu_map_data__synthesize(struct cpu_map_data *data, struct cpu_map *map, + u16 type, int max) +{ + data->type = type; + + switch (type) { + case PERF_CPU_MAP__CPUS: + synthesize_cpus((struct cpu_map_entries *) data->data, map); + break; + case PERF_CPU_MAP__MASK: + synthesize_mask((struct cpu_map_mask *) data->data, map, max); + default: + break; + }; +} + +static struct cpu_map_event* cpu_map_event__new(struct cpu_map *map) +{ + size_t size = sizeof(struct cpu_map_event); + struct cpu_map_event *event; + int max; + u16 type; + + event = cpu_map_data__alloc(map, &size, &type, &max); + if (!event) + return NULL; + + event->header.type = PERF_RECORD_CPU_MAP; + event->header.size = size; + event->data.type = type; + + cpu_map_data__synthesize(&event->data, map, type, max); + return event; +} + +int perf_event__synthesize_cpu_map(struct perf_tool *tool, + struct cpu_map *map, + perf_event__handler_t process, + struct machine *machine) +{ + struct cpu_map_event *event; + int err; + + event = cpu_map_event__new(map); + if (!event) + return -ENOMEM; + + err = process(tool, (union perf_event *) event, NULL, machine); + + free(event); + return err; +} + size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp) { const char *s; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 1c82a0e..de18ee0 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -425,6 +425,7 @@ void perf_event__print_totals(void); struct perf_tool; struct thread_map; +struct cpu_map; typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event, @@ -440,6 +441,10 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct thread_map *threads, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_cpu_map(struct perf_tool *tool, + struct cpu_map *cpus, + perf_event__handler_t process, + struct machine *machine); int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, @@ -550,4 +555,7 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp); u64 kallsyms__get_function_start(const char *kallsyms_filename, const char *symbol_name); +void *cpu_map_data__alloc(struct cpu_map *map, size_t *size, u16 *type, int *max); +void cpu_map_data__synthesize(struct cpu_map_data *data, struct cpu_map *map, + u16 type, int max); #endif /* __PERF_RECORD_H */ -- cgit v1.1 From f77b57ad4fc42a074eae564bbb6660f0a3ff5503 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:25 +0100 Subject: perf cpu_map: Add cpu_map__new_data function Introducing the cpu_map__new_data function to create a struct cpu_map object from a cpu_map event.
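Decoding goes the other way: walk the bitmap and emit one CPU number per set bit, which is what cpu_map__from_mask() below does via for_each_set_bit(). A standalone sketch of that walk:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(long))

int main(void)
{
	/* cpus 0-3 and 19, as encoded in the earlier mask example. */
	unsigned long mask = 0x8000fUL;

	for (size_t bit = 0; bit < BITS_PER_LONG; bit++)
		if (mask & (1UL << bit))
			printf("cpu %zu\n", bit);
	return 0;
}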
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-10-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/cpumap.c | 25 +++++++++++++++++++++---- tools/perf/util/cpumap.c | 42 ++++++++++++++++++++++++++++++++++++++++++ tools/perf/util/cpumap.h | 1 + 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c index 7154805..4cb6418 100644 --- a/tools/perf/tests/cpumap.c +++ b/tools/perf/tests/cpumap.c @@ -6,12 +6,13 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { - struct cpu_map_event *map = &event->cpu_map; + struct cpu_map_event *map_event = &event->cpu_map; struct cpu_map_mask *mask; struct cpu_map_data *data; + struct cpu_map *map; int i; - data = &map->data; + data = &map_event->data; TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__MASK); @@ -23,6 +24,14 @@ static int process_event_mask(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong cpu", test_bit(i, mask->mask)); } + map = cpu_map__new_data(data); + TEST_ASSERT_VAL("wrong nr", map->nr == 20); + + for (i = 0; i < 20; i++) { + TEST_ASSERT_VAL("wrong cpu", map->map[i] == i); + } + + cpu_map__put(map); return 0; } @@ -31,11 +40,12 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine __maybe_unused) { - struct cpu_map_event *map = &event->cpu_map; + struct cpu_map_event *map_event = &event->cpu_map; struct cpu_map_entries *cpus; struct cpu_map_data *data; + struct cpu_map *map; - data = &map->data; + data = &map_event->data; TEST_ASSERT_VAL("wrong type", data->type == PERF_CPU_MAP__CPUS); @@ -44,6 +54,13 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong nr", cpus->nr == 2); TEST_ASSERT_VAL("wrong cpu", cpus->cpu[0] == 1); TEST_ASSERT_VAL("wrong cpu", cpus->cpu[1] == 256); + + map = cpu_map__new_data(data); + TEST_ASSERT_VAL("wrong nr", map->nr == 2); + TEST_ASSERT_VAL("wrong cpu", map->map[0] == 1); + TEST_ASSERT_VAL("wrong cpu", map->map[1] == 256); + TEST_ASSERT_VAL("wrong refcnt", atomic_read(&map->refcnt) == 1); + cpu_map__put(map); return 0; } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 10af1e7..a0717b9 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "asm/bug.h" static struct cpu_map *cpu_map__default_new(void) @@ -179,6 +180,47 @@ out: return cpus; } +static struct cpu_map *cpu_map__from_entries(struct cpu_map_entries *cpus) +{ + struct cpu_map *map; + + map = cpu_map__empty_new(cpus->nr); + if (map) { + unsigned i; + + for (i = 0; i < cpus->nr; i++) + map->map[i] = (int)cpus->cpu[i]; + } + + return map; +} + +static struct cpu_map *cpu_map__from_mask(struct cpu_map_mask *mask) +{ + struct cpu_map *map; + int nr, nbits = mask->nr * mask->long_size * BITS_PER_BYTE; + + nr = bitmap_weight(mask->mask, nbits); + + map = cpu_map__empty_new(nr); + if (map) { + int cpu, i = 0; + + for_each_set_bit(cpu, mask->mask, nbits) + map->map[i++] = cpu; + } + return map; + +} + +struct cpu_map *cpu_map__new_data(struct cpu_map_data *data) +{ + if (data->type == PERF_CPU_MAP__CPUS) + return cpu_map__from_entries((struct cpu_map_entries *)data->data); + else + return cpu_map__from_mask((struct 
cpu_map_mask *)data->data); +} + size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp) { int i; diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 85f7772..71c41b9 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -17,6 +17,7 @@ struct cpu_map { struct cpu_map *cpu_map__new(const char *cpu_list); struct cpu_map *cpu_map__empty_new(int nr); struct cpu_map *cpu_map__dummy_new(void); +struct cpu_map *cpu_map__new_data(struct cpu_map_data *data); struct cpu_map *cpu_map__read(FILE *file); size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp); int cpu_map__get_socket_id(int cpu); -- cgit v1.1 From eb12a1afdc02e59fc09934743490549c77327b1a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:26 +0100 Subject: perf cpu_map: Add perf_event__fprintf_cpu_map function To display a cpu_map event for raw dump. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-11-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 16 ++++++++++++++++ tools/perf/util/event.h | 1 + 2 files changed, 17 insertions(+) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 15d6466..f31ab3b 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -968,6 +968,22 @@ size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp) return ret; } +size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp) +{ + struct cpu_map *cpus = cpu_map__new_data(&event->cpu_map.data); + size_t ret; + + ret = fprintf(fp, " nr: "); + + if (cpus) + ret += cpu_map__fprintf(cpus, fp); + else + ret += fprintf(fp, "failed to get cpumap from event\n"); + + cpu_map__put(cpus); + return ret; +} + int perf_event__process_mmap(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_sample *sample, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index de18ee0..74a4341 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -550,6 +550,7 @@ size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp); size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp); size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp); size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp); size_t perf_event__fprintf(union perf_event *event, FILE *fp); u64 kallsyms__get_function_start(const char *kallsyms_filename, -- cgit v1.1 From 374fb9e362f64e730388abc1de9bb93829670a54 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:27 +0100 Subject: perf tools: Add stat config user level event Adding the stat config event to pass/store stat config data, so report tools (report/script) know how to interpret stat data. The config data is stored in a 'tag|value' way to allow for easy extension and backwards compatibility. 
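A standalone sketch of why the 'tag|value' layout is forward compatible: a reader iterates the pairs and simply skips tags it does not recognize, so an old tool can still consume data written by a newer one. The term values mirror the enum added below; tag 99 is a made-up future term:

#include <stdio.h>
#include <stdint.h>

enum { TERM_AGGR_MODE = 0, TERM_INTERVAL = 1, TERM_SCALE = 2 };

struct term { uint64_t tag, val; };

int main(void)
{
	struct term data[] = {
		{ TERM_AGGR_MODE, 2 },
		{ TERM_INTERVAL, 1000 },
		{ 99, 7 },	/* unknown to this reader */
	};

	for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); i++) {
		switch (data[i].tag) {
		case TERM_AGGR_MODE:
			printf("aggr_mode %llu\n", (unsigned long long)data[i].val);
			break;
		case TERM_INTERVAL:
			printf("interval %llu\n", (unsigned long long)data[i].val);
			break;
		case TERM_SCALE:
			printf("scale %llu\n", (unsigned long long)data[i].val);
			break;
		default:
			/* Unknown tag: skip it rather than fail. */
			break;
		}
	}
	return 0;
}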
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-12-git-send-email-jolsa@kernel.org [ stat_config_term_event -> stat_config_event_entry ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 1 + tools/perf/util/event.h | 20 ++++++++++++++++++++ tools/perf/util/session.c | 24 ++++++++++++++++++++++++ tools/perf/util/tool.h | 3 ++- 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index f31ab3b..43e2dfc 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -39,6 +39,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_AUXTRACE_ERROR] = "AUXTRACE_ERROR", [PERF_RECORD_THREAD_MAP] = "THREAD_MAP", [PERF_RECORD_CPU_MAP] = "CPU_MAP", + [PERF_RECORD_STAT_CONFIG] = "STAT_CONFIG", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 74a4341..16cee44 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -228,6 +228,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_AUXTRACE_ERROR = 72, PERF_RECORD_THREAD_MAP = 73, PERF_RECORD_CPU_MAP = 74, + PERF_RECORD_STAT_CONFIG = 75, PERF_RECORD_HEADER_MAX }; @@ -395,6 +396,24 @@ struct thread_map_event { struct thread_map_event_entry entries[]; }; +enum { + PERF_STAT_CONFIG_TERM__AGGR_MODE = 0, + PERF_STAT_CONFIG_TERM__INTERVAL = 1, + PERF_STAT_CONFIG_TERM__SCALE = 2, + PERF_STAT_CONFIG_TERM__MAX = 3, +}; + +struct stat_config_event_entry { + u64 tag; + u64 val; +}; + +struct stat_config_event { + struct perf_event_header header; + u64 nr; + struct stat_config_event_entry data[]; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -419,6 +438,7 @@ union perf_event { struct context_switch_event context_switch; struct thread_map_event thread_map; struct cpu_map_event cpu_map; + struct stat_config_event stat_config; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4350f5e..fbc52ab 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -315,6 +315,15 @@ int process_event_cpu_map_stub(struct perf_tool *tool __maybe_unused, return 0; } +static +int process_event_stat_config_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_session *session __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) @@ -369,6 +378,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->thread_map = process_event_thread_map_stub; if (tool->cpu_map == NULL) tool->cpu_map = process_event_cpu_map_stub; + if (tool->stat_config == NULL) + tool->stat_config = process_event_stat_config_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -686,6 +697,16 @@ static void perf_event__cpu_map_swap(union perf_event *event, } } +static void perf_event__stat_config_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + u64 size; + + size = event->stat_config.nr * sizeof(event->stat_config.data[0]); + size += 1; /* nr item itself */ + mem_bswap_64(&event->stat_config.nr, size); +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -715,6 +736,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_AUXTRACE_ERROR] = 
perf_event__auxtrace_error_swap, [PERF_RECORD_THREAD_MAP] = perf_event__thread_map_swap, [PERF_RECORD_CPU_MAP] = perf_event__cpu_map_swap, + [PERF_RECORD_STAT_CONFIG] = perf_event__stat_config_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1255,6 +1277,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->thread_map(tool, event, session); case PERF_RECORD_CPU_MAP: return tool->cpu_map(tool, event, session); + case PERF_RECORD_STAT_CONFIG: + return tool->stat_config(tool, event, session); default: return -EINVAL; } diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 9e5925c..aa7ae73 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -57,7 +57,8 @@ struct perf_tool { auxtrace_info, auxtrace_error, thread_map, - cpu_map; + cpu_map, + stat_config; event_op3 auxtrace; bool ordered_events; bool ordering_requires_timestamps; -- cgit v1.1 From 6742434261158ad9678bf15b165304e0200cc324 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:28 +0100 Subject: perf tools: Add stat config event synthesize function Introduce the perf_event__synthesize_stat_config to synthesize a 'struct perf_stat_config'. Storing the stat config in the form of tag-value pairs will, I believe, sort out future version extensibility issues. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-13-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/Build | 1 + tools/perf/tests/builtin-test.c | 4 ++++ tools/perf/tests/stat.c | 53 +++++++++++++++++++++++++++++++++++++++++ tools/perf/tests/tests.h | 1 + tools/perf/util/event.c | 40 +++++++++++++++++++++++++++++++ tools/perf/util/event.h | 5 ++++ 6 files changed, 104 insertions(+) create mode 100644 tools/perf/tests/stat.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 7abad28..fc02931 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -35,6 +35,7 @@ perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o perf-y += bpf.o perf-y += topology.o perf-y += cpumap.o +perf-y += stat.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 0c3fe28..ed8402f 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -188,6 +188,10 @@ static struct test generic_tests[] = { .func = test__cpu_map_synthesize, }, { + .desc = "Test stat config synthesize", + .func = test__synthesize_stat_config, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c new file mode 100644 index 0000000..c7a2bdb --- /dev/null +++ b/tools/perf/tests/stat.c @@ -0,0 +1,53 @@ +#include +#include "event.h" +#include "tests.h" +#include "stat.h" +#include "debug.h" + +static bool has_term(struct stat_config_event *config, + u64 tag, u64 val) +{ + unsigned i; + + for (i = 0; i < config->nr; i++) { + if ((config->data[i].tag == tag) && + (config->data[i].val == val)) + return true; + } + + return false; +} + +static int process_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct stat_config_event *config = &event->stat_config; + +#define HAS(term, val) \ + has_term(config, PERF_STAT_CONFIG_TERM__##term, val) + + TEST_ASSERT_VAL("wrong nr", config->nr == 
PERF_STAT_CONFIG_TERM__MAX); + TEST_ASSERT_VAL("wrong aggr_mode", HAS(AGGR_MODE, AGGR_CORE)); + TEST_ASSERT_VAL("wrong scale", HAS(SCALE, 1)); + TEST_ASSERT_VAL("wrong interval", HAS(INTERVAL, 1)); + +#undef HAS + + return 0; +} + +int test__synthesize_stat_config(int subtest __maybe_unused) +{ + struct perf_stat_config stat_config = { + .aggr_mode = AGGR_CORE, + .scale = 1, + .interval = 1, + }; + + TEST_ASSERT_VAL("failed to synthesize stat_config", + !perf_event__synthesize_stat_config(NULL, &stat_config, process_event, NULL)); + + return 0; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index f85160f..319757a 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -81,6 +81,7 @@ int test__bpf_subtest_get_nr(void); int test_session_topology(int subtest); int test__thread_map_synthesize(int subtest); int test__cpu_map_synthesize(int subtest); +int test__synthesize_stat_config(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 43e2dfc..1ea693c 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -10,6 +10,8 @@ #include "thread.h" #include "thread_map.h" #include "symbol/kallsyms.h" +#include "asm/bug.h" +#include "stat.h" static const char *perf_event__names[] = { [0] = "TOTAL", @@ -869,6 +871,44 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool, return err; } +int perf_event__synthesize_stat_config(struct perf_tool *tool, + struct perf_stat_config *config, + perf_event__handler_t process, + struct machine *machine) +{ + struct stat_config_event *event; + int size, i = 0, err; + + size = sizeof(*event); + size += (PERF_STAT_CONFIG_TERM__MAX * sizeof(event->data[0])); + + event = zalloc(size); + if (!event) + return -ENOMEM; + + event->header.type = PERF_RECORD_STAT_CONFIG; + event->header.size = size; + event->nr = PERF_STAT_CONFIG_TERM__MAX; + +#define ADD(__term, __val) \ + event->data[i].tag = PERF_STAT_CONFIG_TERM__##__term; \ + event->data[i].val = __val; \ + i++; + + ADD(AGGR_MODE, config->aggr_mode) + ADD(INTERVAL, config->interval) + ADD(SCALE, config->scale) + + WARN_ONCE(i != PERF_STAT_CONFIG_TERM__MAX, + "stat config terms unbalanced\n"); +#undef ADD + + err = process(tool, (union perf_event *) event, NULL, machine); + + free(event); + return err; +} + size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp) { const char *s; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 16cee44..39014c7 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -446,6 +446,7 @@ void perf_event__print_totals(void); struct perf_tool; struct thread_map; struct cpu_map; +struct perf_stat_config; typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event, @@ -472,6 +473,10 @@ int perf_event__synthesize_threads(struct perf_tool *tool, int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat_config(struct perf_tool *tool, + struct perf_stat_config *config, + perf_event__handler_t process, + struct machine *machine); int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, -- cgit v1.1 From 8e381596b67af53564a69f16440d3e5d5a73d034 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:29 +0100 Subject: perf tools: Add stat config event read function Introducing the perf_event__read_stat_config function to read a struct 
perf_stat_config object from a stat config event. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-14-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/stat.c | 6 ++++++ tools/perf/util/event.c | 24 ++++++++++++++++++++++++ tools/perf/util/event.h | 2 ++ 3 files changed, 32 insertions(+) diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c index c7a2bdb..aa35d28 100644 --- a/tools/perf/tests/stat.c +++ b/tools/perf/tests/stat.c @@ -24,6 +24,7 @@ static int process_event(struct perf_tool *tool __maybe_unused, struct machine *machine __maybe_unused) { struct stat_config_event *config = &event->stat_config; + struct perf_stat_config stat_config; #define HAS(term, val) \ has_term(config, PERF_STAT_CONFIG_TERM__##term, val) @@ -35,6 +36,11 @@ static int process_event(struct perf_tool *tool __maybe_unused, #undef HAS + perf_event__read_stat_config(&stat_config, config); + + TEST_ASSERT_VAL("wrong aggr_mode", stat_config.aggr_mode == AGGR_CORE); + TEST_ASSERT_VAL("wrong scale", stat_config.scale == 1); + TEST_ASSERT_VAL("wrong interval", stat_config.interval == 1); return 0; } diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 1ea693c..223deaf 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -909,6 +909,30 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, return err; } +void perf_event__read_stat_config(struct perf_stat_config *config, + struct stat_config_event *event) +{ + unsigned i; + + for (i = 0; i < event->nr; i++) { + + switch (event->data[i].tag) { +#define CASE(__term, __val) \ case PERF_STAT_CONFIG_TERM__##__term: \ config->__val = event->data[i].val; \ break; + + CASE(AGGR_MODE, aggr_mode) + CASE(SCALE, scale) + CASE(INTERVAL, interval) +#undef CASE + default: + pr_warning("unknown stat config term %" PRIu64 "\n", + event->data[i].tag); + } + } +} + size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp) { const char *s; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 39014c7..4e87be2 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -477,6 +477,8 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); +void perf_event__read_stat_config(struct perf_stat_config *config, + struct stat_config_event *event); int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, -- cgit v1.1 From d80518c90bb2b4af9755d79af5dfe9d44e04cdb9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:30 +0100 Subject: perf tools: Add stat user level event Adding a stat event to store a 'struct perf_counts_values' for a given event/cpu/thread.
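For orientation, here is a minimal, self-contained sketch (not part of the patch) of how such a triple of values is conventionally consumed: 'ena' and 'run' are the enabled/running times that perf uses to scale counts when counters were multiplexed. The struct below only mirrors the val/ena/run fields the stat event carries; the scaling formula is the standard perf convention, not code from this series.

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors the val/ena/run triple carried by the stat event. */
    struct counts { uint64_t val, ena, run; };

    /* Standard multiplexing scaling: count * (enabled time / running time). */
    static double scaled(const struct counts *c)
    {
            return c->run ? (double)c->val * c->ena / c->run : 0.0;
    }

    int main(void)
    {
            struct counts c = { .val = 100, .ena = 200, .run = 100 };

            printf("scaled: %.0f\n", scaled(&c)); /* prints 200 */
            return 0;
    }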
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-15-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 1 + tools/perf/util/event.h | 19 +++++++++++++++++++ tools/perf/util/session.c | 25 +++++++++++++++++++++++++ tools/perf/util/tool.h | 3 ++- 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 223deaf..670123f 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -42,6 +42,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_THREAD_MAP] = "THREAD_MAP", [PERF_RECORD_CPU_MAP] = "CPU_MAP", [PERF_RECORD_STAT_CONFIG] = "STAT_CONFIG", + [PERF_RECORD_STAT] = "STAT", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 4e87be2..f23f464 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -229,6 +229,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_THREAD_MAP = 73, PERF_RECORD_CPU_MAP = 74, PERF_RECORD_STAT_CONFIG = 75, + PERF_RECORD_STAT = 76, PERF_RECORD_HEADER_MAX }; @@ -414,6 +415,23 @@ struct stat_config_event { struct stat_config_event_entry data[]; }; +struct stat_event { + struct perf_event_header header; + + u64 id; + u32 cpu; + u32 thread; + + union { + struct { + u64 val; + u64 ena; + u64 run; + }; + u64 values[3]; + }; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -439,6 +457,7 @@ union perf_event { struct thread_map_event thread_map; struct cpu_map_event cpu_map; struct stat_config_event stat_config; + struct stat_event stat; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index fbc52ab..663a2fd 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -324,6 +324,15 @@ int process_event_stat_config_stub(struct perf_tool *tool __maybe_unused, return 0; } +static int process_stat_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_session *perf_session + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) @@ -380,6 +389,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->cpu_map = process_event_cpu_map_stub; if (tool->stat_config == NULL) tool->stat_config = process_event_stat_config_stub; + if (tool->stat == NULL) + tool->stat = process_stat_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -707,6 +718,17 @@ static void perf_event__stat_config_swap(union perf_event *event, mem_bswap_64(&event->stat_config.nr, size); } +static void perf_event__stat_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + event->stat.id = bswap_64(event->stat.id); + event->stat.thread = bswap_32(event->stat.thread); + event->stat.cpu = bswap_32(event->stat.cpu); + event->stat.val = bswap_64(event->stat.val); + event->stat.ena = bswap_64(event->stat.ena); + event->stat.run = bswap_64(event->stat.run); +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -737,6 +759,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_THREAD_MAP] = perf_event__thread_map_swap, [PERF_RECORD_CPU_MAP] = perf_event__cpu_map_swap, [PERF_RECORD_STAT_CONFIG] = perf_event__stat_config_swap, + [PERF_RECORD_STAT] = 
perf_event__stat_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1279,6 +1302,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->cpu_map(tool, event, session); case PERF_RECORD_STAT_CONFIG: return tool->stat_config(tool, event, session); + case PERF_RECORD_STAT: + return tool->stat(tool, event, session); default: return -EINVAL; } diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index aa7ae73..f0b9da0 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -58,7 +58,8 @@ struct perf_tool { auxtrace_error, thread_map, cpu_map, - stat_config; + stat_config, + stat; event_op3 auxtrace; bool ordered_events; bool ordering_requires_timestamps; -- cgit v1.1 From 5796f8f073fe50171376f058376dde93ec5f3785 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:31 +0100 Subject: perf tools: Add stat event synthesize function Introduce the perf_event__synthesize_stat function to synthesize a 'struct stat_event'. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-16-git-send-email-jolsa@kernel.org [ Renamed 'stat' parameter to 'st' to fix 'already defined' build error with older distros (e.g. RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 4 ++++ tools/perf/tests/stat.c | 41 ++++++++++++++++++++++++++++++++++++----- tools/perf/tests/tests.h | 1 + tools/perf/util/event.c | 22 ++++++++++++++++++++++ tools/perf/util/event.h | 7 ++++++- 5 files changed, 69 insertions(+), 6 deletions(-) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index ed8402f..4a7d998 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -192,6 +192,10 @@ static struct test generic_tests[] = { .func = test__synthesize_stat_config, }, { + .desc = "Test stat synthesize", + .func = test__synthesize_stat, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c index aa35d28..d319875 100644 --- a/tools/perf/tests/stat.c +++ b/tools/perf/tests/stat.c @@ -2,6 +2,7 @@ #include "event.h" #include "tests.h" #include "stat.h" +#include "counts.h" #include "debug.h" static bool has_term(struct stat_config_event *config, @@ -18,10 +19,10 @@ static bool has_term(struct stat_config_event *config, return false; } -static int process_event(struct perf_tool *tool __maybe_unused, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) +static int process_stat_config_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) { struct stat_config_event *config = &event->stat_config; struct perf_stat_config stat_config; @@ -53,7 +54,37 @@ int test__synthesize_stat_config(int subtest __maybe_unused) }; TEST_ASSERT_VAL("failed to synthesize stat_config", - !perf_event__synthesize_stat_config(NULL, &stat_config, process_event, NULL)); + !perf_event__synthesize_stat_config(NULL, &stat_config, process_stat_config_event, NULL)); + + return 0; +} + +static int process_stat_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct stat_event *st = &event->stat; + + TEST_ASSERT_VAL("wrong cpu", st->cpu == 1); + TEST_ASSERT_VAL("wrong thread", st->thread == 2); + TEST_ASSERT_VAL("wrong id", 
st->id == 3); + TEST_ASSERT_VAL("wrong val", st->val == 100); + TEST_ASSERT_VAL("wrong ena", st->ena == 200); + TEST_ASSERT_VAL("wrong run", st->run == 300); + return 0; +} + +int test__synthesize_stat(int subtest __maybe_unused) +{ + struct perf_counts_values count; + + count.val = 100; + count.ena = 200; + count.run = 300; + + TEST_ASSERT_VAL("failed to synthesize stat", + !perf_event__synthesize_stat(NULL, 1, 2, 3, &count, process_stat_event, NULL)); return 0; } diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 319757a..d36eda1 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -82,6 +82,7 @@ int test_session_topology(int subtest); int test__thread_map_synthesize(int subtest); int test__cpu_map_synthesize(int subtest); int test__synthesize_stat_config(int subtest); +int test__synthesize_stat(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 670123f..eb8243a 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -910,6 +910,28 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, return err; } +int perf_event__synthesize_stat(struct perf_tool *tool, + u32 cpu, u32 thread, u64 id, + struct perf_counts_values *count, + perf_event__handler_t process, + struct machine *machine) +{ + struct stat_event event; + + event.header.type = PERF_RECORD_STAT; + event.header.size = sizeof(event); + event.header.misc = 0; + + event.id = id; + event.cpu = cpu; + event.thread = thread; + event.val = count->val; + event.ena = count->ena; + event.run = count->run; + + return process(tool, (union perf_event *) &event, NULL, machine); +} + void perf_event__read_stat_config(struct perf_stat_config *config, struct stat_config_event *event) { diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index f23f464..336eb44 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -466,6 +466,7 @@ struct perf_tool; struct thread_map; struct cpu_map; struct perf_stat_config; +struct perf_counts_values; typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event, @@ -498,7 +499,11 @@ int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine); void perf_event__read_stat_config(struct perf_stat_config *config, struct stat_config_event *event); - +int perf_event__synthesize_stat(struct perf_tool *tool, + u32 cpu, u32 thread, u64 id, + struct perf_counts_values *count, + perf_event__handler_t process, + struct machine *machine); int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -- cgit v1.1 From 0ea0e3558607626196eb09ace796aac585e61f5c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:32 +0100 Subject: perf tools: Add stat event read function Introducing the perf_event__process_stat_event function to process the counter data carried in a stat event. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-17-git-send-email-jolsa@kernel.org [ Renamed 'stat' parameter to 'st' to fix 'already defined' build error with older distros (e.g.
RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat.c | 23 +++++++++++++++++++++++ tools/perf/util/stat.h | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2d9d830..0ad59ce 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -341,3 +341,26 @@ int perf_stat_process_counter(struct perf_stat_config *config, return 0; } + +int perf_event__process_stat_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session) +{ + struct perf_counts_values count; + struct stat_event *st = &event->stat; + struct perf_evsel *counter; + + count.val = st->val; + count.ena = st->ena; + count.run = st->run; + + counter = perf_evlist__id2evsel(session->evlist, st->id); + if (!counter) { + pr_err("Failed to resolve counter for stat event.\n"); + return -EINVAL; + } + + *perf_counts(counter->counts, st->cpu, st->thread) = count; + counter->supported = true; + return 0; +} diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index da1d11c..afe6844 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -90,4 +90,10 @@ void perf_evlist__reset_stats(struct perf_evlist *evlist); int perf_stat_process_counter(struct perf_stat_config *config, struct perf_evsel *counter); +struct perf_tool; +union perf_event; +struct perf_session; +int perf_event__process_stat_event(struct perf_tool *tool, + union perf_event *event, + struct perf_session *session); #endif -- cgit v1.1 From 2d8f0f18a5c37cf0322cb385b99adb1167b7cf78 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:33 +0100 Subject: perf tools: Add stat round user level event Adding the stat round event to be stored after each stat interval round, so that the report tools (report/script) get notified and can process the interval data.
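As a rough illustration of how a report tool is expected to react to a round event (a sketch using local mirrors of the types added below, not code from this series):

    #include <stdio.h>
    #include <stdint.h>

    /* Local stand-ins for PERF_STAT_ROUND_TYPE__* and stat_round_event. */
    enum { ROUND_INTERVAL = 0, ROUND_FINAL = 1 };
    struct round { uint64_t type, time; };

    static void handle_round(const struct round *rd)
    {
            if (rd->type == ROUND_INTERVAL)
                    printf("interval results at time %llu\n",
                           (unsigned long long)rd->time);
            else
                    printf("final results\n");
    }

    int main(void)
    {
            struct round r = { ROUND_INTERVAL, 123 };

            handle_round(&r);
            return 0;
    }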
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-18-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 1 + tools/perf/util/event.h | 13 +++++++++++++ tools/perf/util/session.c | 21 +++++++++++++++++++++ tools/perf/util/tool.h | 3 ++- 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index eb8243a..725db54 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -43,6 +43,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_CPU_MAP] = "CPU_MAP", [PERF_RECORD_STAT_CONFIG] = "STAT_CONFIG", [PERF_RECORD_STAT] = "STAT", + [PERF_RECORD_STAT_ROUND] = "STAT_ROUND", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 336eb44..5eb4f55 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -230,6 +230,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_CPU_MAP = 74, PERF_RECORD_STAT_CONFIG = 75, PERF_RECORD_STAT = 76, + PERF_RECORD_STAT_ROUND = 77, PERF_RECORD_HEADER_MAX }; @@ -432,6 +433,17 @@ struct stat_event { }; }; +enum { + PERF_STAT_ROUND_TYPE__INTERVAL = 0, + PERF_STAT_ROUND_TYPE__FINAL = 1, +}; + +struct stat_round_event { + struct perf_event_header header; + u64 type; + u64 time; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -458,6 +470,7 @@ union perf_event { struct cpu_map_event cpu_map; struct stat_config_event stat_config; struct stat_event stat; + struct stat_round_event stat_round; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 663a2fd..5b3a81a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -333,6 +333,15 @@ static int process_stat_stub(struct perf_tool *tool __maybe_unused, return 0; } +static int process_stat_round_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_session *perf_session + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + void perf_tool__fill_defaults(struct perf_tool *tool) { if (tool->sample == NULL) @@ -391,6 +400,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->stat_config = process_event_stat_config_stub; if (tool->stat == NULL) tool->stat = process_stat_stub; + if (tool->stat_round == NULL) + tool->stat_round = process_stat_round_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -729,6 +740,13 @@ static void perf_event__stat_swap(union perf_event *event, event->stat.run = bswap_64(event->stat.run); } +static void perf_event__stat_round_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + event->stat_round.type = bswap_64(event->stat_round.type); + event->stat_round.time = bswap_64(event->stat_round.time); +} + typedef void (*perf_event__swap_op)(union perf_event *event, bool sample_id_all); @@ -760,6 +778,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_CPU_MAP] = perf_event__cpu_map_swap, [PERF_RECORD_STAT_CONFIG] = perf_event__stat_config_swap, [PERF_RECORD_STAT] = perf_event__stat_swap, + [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1304,6 +1323,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->stat_config(tool, event, session); case 
PERF_RECORD_STAT: return tool->stat(tool, event, session); + case PERF_RECORD_STAT_ROUND: + return tool->stat_round(tool, event, session); default: return -EINVAL; } diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index f0b9da0..d04d9e5 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -59,7 +59,8 @@ struct perf_tool { thread_map, cpu_map, stat_config, - stat; + stat, + stat_round; event_op3 auxtrace; bool ordered_events; bool ordering_requires_timestamps; -- cgit v1.1 From d4c2259195f538505d2570e78555532372fb4ad2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:34 +0100 Subject: perf tools: Add stat round event synthesize function Introduce the perf_event__synthesize_stat_round function to synthesize a 'struct stat_round_event'. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-19-git-send-email-jolsa@kernel.org [ Renamed 'time' parameter to 'evtime' to fix build on older systems ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/builtin-test.c | 4 ++++ tools/perf/tests/stat.c | 21 +++++++++++++++++++++ tools/perf/tests/tests.h | 1 + tools/perf/util/event.c | 17 +++++++++++++++++ tools/perf/util/event.h | 4 ++++ 5 files changed, 47 insertions(+) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 4a7d998..6a35198 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -196,6 +196,10 @@ static struct test generic_tests[] = { .func = test__synthesize_stat, }, { + .desc = "Test stat round synthesize", + .func = test__synthesize_stat_round, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c index d319875..6a20ff2 100644 --- a/tools/perf/tests/stat.c +++ b/tools/perf/tests/stat.c @@ -88,3 +88,24 @@ int test__synthesize_stat(int subtest __maybe_unused) return 0; } + +static int process_stat_round_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct stat_round_event *stat_round = &event->stat_round; + + TEST_ASSERT_VAL("wrong time", stat_round->time == 0xdeadbeef); + TEST_ASSERT_VAL("wrong type", stat_round->type == PERF_STAT_ROUND_TYPE__INTERVAL); + return 0; +} + +int test__synthesize_stat_round(int subtest __maybe_unused) +{ + TEST_ASSERT_VAL("failed to synthesize stat_round", + !perf_event__synthesize_stat_round(NULL, 0xdeadbeef, PERF_STAT_ROUND_TYPE__INTERVAL, + process_stat_round_event, NULL)); + + return 0; +} diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index d36eda1..a82ab9c 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -83,6 +83,7 @@ int test__thread_map_synthesize(int subtest); int test__cpu_map_synthesize(int subtest); int test__synthesize_stat_config(int subtest); int test__synthesize_stat(int subtest); +int test__synthesize_stat_round(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 725db54..e4c68ba 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -933,6 +933,23 @@ int perf_event__synthesize_stat(struct perf_tool *tool, return process(tool, (union perf_event *) &event, NULL, machine); } +int perf_event__synthesize_stat_round(struct perf_tool *tool, + u64 evtime, u64 type, + perf_event__handler_t process, + struct machine *machine)
+{ + struct stat_round_event event; + + event.header.type = PERF_RECORD_STAT_ROUND; + event.header.size = sizeof(event); + event.header.misc = 0; + + event.time = evtime; + event.type = type; + + return process(tool, (union perf_event *) &event, NULL, machine); +} + void perf_event__read_stat_config(struct perf_stat_config *config, struct stat_config_event *event) { diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 5eb4f55..1afaa21 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -517,6 +517,10 @@ int perf_event__synthesize_stat(struct perf_tool *tool, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine); +int perf_event__synthesize_stat_round(struct perf_tool *tool, + u64 time, u64 type, + perf_event__handler_t process, + struct machine *machine); int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine); -- cgit v1.1 From e08a4564e23f8f89a055d717887674f54a9da515 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:35 +0100 Subject: perf tools: Add stat events fprintf functions Introducing the following functions to display the stat events for raw dump. perf_event__fprintf_stat perf_event__fprintf_stat_round perf_event__fprintf_stat_config Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-20-git-send-email-jolsa@kernel.org [ s/stat/st/g and s/round/rd/g parameters to fix 'already defined' build error with older distros (e.g. RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat.c | 39 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/stat.h | 4 ++++ 2 files changed, 43 insertions(+) diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 0ad59ce..2f901d1 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -364,3 +364,42 @@ int perf_event__process_stat_event(struct perf_tool *tool __maybe_unused, counter->supported = true; return 0; } + +size_t perf_event__fprintf_stat(union perf_event *event, FILE *fp) +{ + struct stat_event *st = (struct stat_event *) event; + size_t ret; + + ret = fprintf(fp, "\n... id %" PRIu64 ", cpu %d, thread %d\n", + st->id, st->cpu, st->thread); + ret += fprintf(fp, "... value %" PRIu64 ", enabled %" PRIu64 ", running %" PRIu64 "\n", + st->val, st->ena, st->run); + + return ret; +} + +size_t perf_event__fprintf_stat_round(union perf_event *event, FILE *fp) +{ + struct stat_round_event *rd = (struct stat_round_event *)event; + size_t ret; + + ret = fprintf(fp, "\n... time %" PRIu64 ", type %s\n", rd->time, + rd->type == PERF_STAT_ROUND_TYPE__FINAL ? "FINAL" : "INTERVAL"); + + return ret; +} + +size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp) +{ + struct perf_stat_config sc; + size_t ret; + + perf_event__read_stat_config(&sc, &event->stat_config); + + ret = fprintf(fp, "\n"); + ret += fprintf(fp, "... aggr_mode %d\n", sc.aggr_mode); + ret += fprintf(fp, "... scale %d\n", sc.scale); + ret += fprintf(fp, "... 
interval %u\n", sc.interval); + + return ret; +} diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index afe6844..086f4e1 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -96,4 +96,8 @@ struct perf_session; int perf_event__process_stat_event(struct perf_tool *tool, union perf_event *event, struct perf_session *session); + +size_t perf_event__fprintf_stat(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_stat_round(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp); #endif -- cgit v1.1 From ffe777254cce24fb5fde3f0aa91fc755cfb1b812 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:36 +0100 Subject: perf tools: Add event_update user level event It'll serve as a base event for additional event attribute details that are not part of the attr event. At the moment this event is just a dummy one without any specific functionality. The type value distinguishes the kind of update; the specific types come in the following patches. The idea for this event is to be extensible for any update that the event might need in the future. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-21-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/event.c | 1 + tools/perf/util/event.h | 10 ++++++++++ tools/perf/util/header.c | 20 ++++++++++++++++++++ tools/perf/util/header.h | 3 +++ tools/perf/util/session.c | 21 +++++++++++++++++++++ tools/perf/util/tool.h | 1 + 6 files changed, 56 insertions(+) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index e4c68ba..cd61bb1 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -44,6 +44,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_STAT_CONFIG] = "STAT_CONFIG", [PERF_RECORD_STAT] = "STAT", [PERF_RECORD_STAT_ROUND] = "STAT_ROUND", + [PERF_RECORD_EVENT_UPDATE] = "EVENT_UPDATE", }; const char *perf_event__name(unsigned int id) diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 1afaa21..6966a4b 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -231,6 +231,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_STAT_CONFIG = 75, PERF_RECORD_STAT = 76, PERF_RECORD_STAT_ROUND = 77, + PERF_RECORD_EVENT_UPDATE = 78, PERF_RECORD_HEADER_MAX }; @@ -307,6 +308,14 @@ struct attr_event { u64 id[]; }; +struct event_update_event { + struct perf_event_header header; + u64 type; + u64 id; + + char data[]; +}; + #define MAX_EVENT_NAME 64 struct perf_trace_event_type { @@ -456,6 +465,7 @@ union perf_event { struct throttle_event throttle; struct sample_event sample; struct attr_event attr; + struct event_update_event event_update; struct event_type_event event_type; struct tracing_data_event tracing_data; struct build_id_event build_id; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 5ac7bdb..6b4e002 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2745,6 +2745,26 @@ int perf_event__process_attr(struct perf_tool *tool __maybe_unused, return 0; } +int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_evlist **pevlist) +{ + struct event_update_event *ev = &event->event_update; + struct perf_evlist *evlist; + struct perf_evsel *evsel; + + if (!pevlist || *pevlist == NULL) + return -EINVAL; + + evlist = *pevlist; + + evsel =
perf_evlist__id2evsel(evlist, ev->id); + if (evsel == NULL) + return -EINVAL; + + return 0; +} + int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct perf_evlist *evlist, perf_event__handler_t process) diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 05f27cb..1e843c6 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -107,6 +107,9 @@ int perf_event__synthesize_attrs(struct perf_tool *tool, perf_event__handler_t process); int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist); +int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_evlist **pevlist); int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct perf_evlist *evlist, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5b3a81a..49e5cdc 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -205,6 +205,15 @@ static int process_event_synth_attr_stub(struct perf_tool *tool __maybe_unused, return 0; } +static int process_event_synth_event_update_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_evlist **pevlist + __maybe_unused) +{ + dump_printf(": unhandled!\n"); + return 0; +} + static int process_event_sample_stub(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_sample *sample __maybe_unused, @@ -374,6 +383,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->unthrottle = process_event_stub; if (tool->attr == NULL) tool->attr = process_event_synth_attr_stub; + if (tool->event_update == NULL) + tool->event_update = process_event_synth_event_update_stub; if (tool->tracing_data == NULL) tool->tracing_data = process_event_synth_tracing_data_stub; if (tool->build_id == NULL) @@ -625,6 +636,13 @@ static void perf_event__hdr_attr_swap(union perf_event *event, mem_bswap_64(event->attr.id, size); } +static void perf_event__event_update_swap(union perf_event *event, + bool sample_id_all __maybe_unused) +{ + event->event_update.type = bswap_64(event->event_update.type); + event->event_update.id = bswap_64(event->event_update.id); +} + static void perf_event__event_type_swap(union perf_event *event, bool sample_id_all __maybe_unused) { @@ -779,6 +797,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_STAT_CONFIG] = perf_event__stat_config_swap, [PERF_RECORD_STAT] = perf_event__stat_swap, [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, + [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -1290,6 +1309,8 @@ static s64 perf_session__process_user_event(struct perf_session *session, perf_session__set_comm_exec(session); } return err; + case PERF_RECORD_EVENT_UPDATE: + return tool->event_update(tool, event, &session->evlist); case PERF_RECORD_HEADER_EVENT_TYPE: /* * Depreceated, but we need to handle it for sake diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index d04d9e5..55de4cf 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -50,6 +50,7 @@ struct perf_tool { throttle, unthrottle; event_attr_op attr; + event_attr_op event_update; event_op2 tracing_data; event_oe finished_round; event_op2 build_id, -- cgit v1.1 From a6e5281780d1da65c15ce529707f43eb4a6df856 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:37 +0100 Subject: perf tools: Add event_update event unit type Adding 
the unit type 'event update' event, which stores/transfers the event's unit name. The unit name is part of the perf stat output data. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-22-git-send-email-jolsa@kernel.org [ Rename __alloc() to __new() for consistency ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/Build | 1 + tools/perf/tests/builtin-test.c | 4 ++++ tools/perf/tests/event_update.c | 42 +++++++++++++++++++++++++++++++++++++++ tools/perf/tests/tests.h | 1 + tools/perf/util/event.h | 4 ++++ tools/perf/util/header.c | 44 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/header.h | 3 +++ 7 files changed, 99 insertions(+) create mode 100644 tools/perf/tests/event_update.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index fc02931..614899b 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -36,6 +36,7 @@ perf-y += bpf.o perf-y += topology.o perf-y += cpumap.o perf-y += stat.o +perf-y += event_update.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 6a35198..f2b1dca 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -200,6 +200,10 @@ static struct test generic_tests[] = { .func = test__synthesize_stat_round, }, { + .desc = "Test attr update synthesize", + .func = test__event_update, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c new file mode 100644 index 0000000..9cdf4c9 --- /dev/null +++ b/tools/perf/tests/event_update.c @@ -0,0 +1,42 @@ +#include <linux/compiler.h> +#include "evlist.h" +#include "evsel.h" +#include "machine.h" +#include "tests.h" +#include "debug.h" + +static int process_event_unit(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct event_update_event *ev = (struct event_update_event *) event; + + TEST_ASSERT_VAL("wrong id", ev->id == 123); + TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__UNIT); + TEST_ASSERT_VAL("wrong unit", !strcmp(ev->data, "KRAVA")); + return 0; +} + +int test__event_update(int subtest __maybe_unused) +{ + struct perf_evlist *evlist; + struct perf_evsel *evsel; + + evlist = perf_evlist__new_default(); + TEST_ASSERT_VAL("failed to get evlist", evlist); + + evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("failed to alloc ids", + !perf_evsel__alloc_id(evsel, 1, 1)); + + perf_evlist__id_add(evlist, evsel, 0, 0, 123); + + evsel->unit = strdup("KRAVA"); + + TEST_ASSERT_VAL("failed to synthesize attr update unit", + !perf_event__synthesize_event_update_unit(NULL, evsel, process_event_unit)); + + return 0; } diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index a82ab9c..82b2b5e 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -84,6 +84,7 @@ int test__cpu_map_synthesize(int subtest); int test__synthesize_stat_config(int subtest); int test__synthesize_stat(int subtest); int test__synthesize_stat_round(int subtest); +int test__event_update(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 6966a4b..64c4cdf 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -308,6 +308,10 @@ struct attr_event { u64 id[]; }; +enum { +
PERF_EVENT_UPDATE__UNIT = 0, +}; + struct event_update_event { struct perf_event_header header; u64 type; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 6b4e002..5759ebf 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2686,6 +2686,43 @@ int perf_event__synthesize_attr(struct perf_tool *tool, return err; } +static struct event_update_event * +event_update_event__new(size_t size, u64 type, u64 id) +{ + struct event_update_event *ev; + + size += sizeof(*ev); + size = PERF_ALIGN(size, sizeof(u64)); + + ev = zalloc(size); + if (ev) { + ev->header.type = PERF_RECORD_EVENT_UPDATE; + ev->header.size = (u16)size; + ev->type = type; + ev->id = id; + } + return ev; +} + +int +perf_event__synthesize_event_update_unit(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process) +{ + struct event_update_event *ev; + size_t size = strlen(evsel->unit); + int err; + + ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->id[0]); + if (ev == NULL) + return -ENOMEM; + + strncpy(ev->data, evsel->unit, size); + err = process(tool, (union perf_event *)ev, NULL, NULL); + free(ev); + return err; +} + int perf_event__synthesize_attrs(struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process) @@ -2762,6 +2799,13 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, if (evsel == NULL) return -EINVAL; + switch (ev->type) { + case PERF_EVENT_UPDATE__UNIT: + evsel->unit = strdup(ev->data); + default: + break; + } + return 0; } diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 1e843c6..6aa2b92 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -105,6 +105,9 @@ int perf_event__synthesize_attr(struct perf_tool *tool, int perf_event__synthesize_attrs(struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process); +int perf_event__synthesize_event_update_unit(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process); int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist); int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, -- cgit v1.1 From daeecbc0c431f15f492fb8d704080a02de6e2918 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:38 +0100 Subject: perf tools: Add event_update event scale type Adding the scale type 'event update' event, which stores/transfers the event's scale value. PMU events can define a scale value, which is used to multiply the event's raw counts.
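To make the scale term concrete, a self-contained sketch (the 2.5e-10 figure is an illustrative RAPL-style energy scale, not a value taken from this series):

    #include <stdio.h>
    #include <stdint.h>

    /* A scale term simply multiplies the raw count before display. */
    static double apply_scale(uint64_t raw, double scale)
    {
            return (double)raw * scale;
    }

    int main(void)
    {
            /* e.g. an energy counter whose ticks are small fractions of a Joule */
            printf("%.3f Joules\n", apply_scale(4000000000ULL, 2.5e-10));
            return 0;
    }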
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-23-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 21 +++++++++++++++++++++ tools/perf/util/event.h | 5 +++++ tools/perf/util/header.c | 26 ++++++++++++++++++++++++++ tools/perf/util/header.h | 3 +++ 4 files changed, 55 insertions(+) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 9cdf4c9..a91fcef 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -18,6 +18,22 @@ static int process_event_unit(struct perf_tool *tool __maybe_unused, return 0; } +static int process_event_scale(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct event_update_event *ev = (struct event_update_event *) event; + struct event_update_event_scale *ev_data; + + ev_data = (struct event_update_event_scale *) ev->data; + + TEST_ASSERT_VAL("wrong id", ev->id == 123); + TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__SCALE); + TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123); + return 0; +} + int test__event_update(int subtest __maybe_unused) { struct perf_evlist *evlist; @@ -38,5 +54,10 @@ int test__event_update(int subtest __maybe_unused) TEST_ASSERT_VAL("failed to synthesize attr update unit", !perf_event__synthesize_event_update_unit(NULL, evsel, process_event_unit)); + evsel->scale = 0.123; + + TEST_ASSERT_VAL("failed to synthesize attr update scale", + !perf_event__synthesize_event_update_scale(NULL, evsel, process_event_scale)); + return 0; } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 64c4cdf..44198e8 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -310,6 +310,11 @@ struct attr_event { enum { PERF_EVENT_UPDATE__UNIT = 0, + PERF_EVENT_UPDATE__SCALE = 1, +}; + +struct event_update_event_scale { + double scale; }; struct event_update_event { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 5759ebf..30edb4b 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2723,6 +2723,27 @@ perf_event__synthesize_event_update_unit(struct perf_tool *tool, return err; } +int +perf_event__synthesize_event_update_scale(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process) +{ + struct event_update_event *ev; + struct event_update_event_scale *ev_data; + int err; + + ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->id[0]); + if (ev == NULL) + return -ENOMEM; + + ev_data = (struct event_update_event_scale *) ev->data; + ev_data->scale = evsel->scale; + err = process(tool, (union perf_event*) ev, NULL, NULL); + free(ev); + return err; +} + + int perf_event__synthesize_attrs(struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process) @@ -2787,6 +2808,7 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, struct perf_evlist **pevlist) { struct event_update_event *ev = &event->event_update; + struct event_update_event_scale *ev_scale; struct perf_evlist *evlist; struct perf_evsel *evsel; @@ -2802,6 +2824,10 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, switch (ev->type) { case PERF_EVENT_UPDATE__UNIT: evsel->unit = strdup(ev->data); + break; + case PERF_EVENT_UPDATE__SCALE: + ev_scale = (struct
event_update_event_scale *) ev->data; + evsel->scale = ev_scale->scale; default: break; } diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 6aa2b92..fad04cb 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -108,6 +108,9 @@ int perf_event__synthesize_attrs(struct perf_tool *tool, int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct perf_evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_scale(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process); int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist); int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, -- cgit v1.1 From 802c9048b824eaa3c75d875e2d107460ad586439 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:39 +0100 Subject: perf tools: Add event_update event name type Adding the name type 'event update' event, which stores/transfers the event's name. The event name is stored within perf.data's EVENT_DESC feature, but it is not available when the report data arrives through a pipe. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-24-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 25 +++++++++++++++++++++++++ tools/perf/util/event.h | 1 + tools/perf/util/header.c | 21 +++++++++++++++++++++ tools/perf/util/header.h | 3 +++ 4 files changed, 50 insertions(+) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index a91fcef..482b896 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -34,10 +34,30 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused, return 0; } +struct event_name { + struct perf_tool tool; + const char *name; +}; + +static int process_event_name(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct event_name *tmp = container_of(tool, struct event_name, tool); + struct event_update_event *ev = (struct event_update_event*) event; + + TEST_ASSERT_VAL("wrong id", ev->id == 123); + TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__NAME); + TEST_ASSERT_VAL("wrong name", !strcmp(ev->data, tmp->name)); + return 0; +} + int test__event_update(int subtest __maybe_unused) { struct perf_evlist *evlist; struct perf_evsel *evsel; + struct event_name tmp; evlist = perf_evlist__new_default(); TEST_ASSERT_VAL("failed to get evlist", evlist); @@ -59,5 +79,10 @@ int test__event_update(int subtest __maybe_unused) TEST_ASSERT_VAL("failed to synthesize attr update scale", !perf_event__synthesize_event_update_scale(NULL, evsel, process_event_scale)); + tmp.name = perf_evsel__name(evsel); + + TEST_ASSERT_VAL("failed to synthesize attr update name", + !perf_event__synthesize_event_update_name(&tmp.tool, evsel, process_event_name)); + return 0; } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 44198e8..235196b 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -311,6 +311,7 @@ struct attr_event { enum { PERF_EVENT_UPDATE__UNIT = 0, PERF_EVENT_UPDATE__SCALE = 1, + PERF_EVENT_UPDATE__NAME = 2, }; struct event_update_event_scale { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 30edb4b..cd3d005 100644 --- a/tools/perf/util/header.c +++
b/tools/perf/util/header.c @@ -2743,6 +2743,24 @@ perf_event__synthesize_event_update_scale(struct perf_tool *tool, return err; } +int +perf_event__synthesize_event_update_name(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process) +{ + struct event_update_event *ev; + size_t len = strlen(evsel->name); + int err; + + ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->id[0]); + if (ev == NULL) + return -ENOMEM; + + strncpy(ev->data, evsel->name, len); + err = process(tool, (union perf_event*) ev, NULL, NULL); + free(ev); + return err; +} int perf_event__synthesize_attrs(struct perf_tool *tool, struct perf_session *session, @@ -2825,6 +2843,9 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, case PERF_EVENT_UPDATE__UNIT: evsel->unit = strdup(ev->data); break; + case PERF_EVENT_UPDATE__NAME: + evsel->name = strdup(ev->data); + break; case PERF_EVENT_UPDATE__SCALE: ev_scale = (struct event_update_event_scale *) ev->data; evsel->scale = ev_scale->scale; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index fad04cb..51cf566 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -111,6 +111,9 @@ int perf_event__synthesize_event_update_unit(struct perf_tool *tool, int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct perf_evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_name(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process); int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist); int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, -- cgit v1.1 From 86ebb09f96fe6886e1e5d53b648df5537ba859ca Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:40 +0100 Subject: perf tools: Add event_update event cpus type Adding the cpumask 'event update' event, which stores/transfers the cpumask for an event.
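For intuition about what gets transferred, a self-contained illustration (not the perf implementation) of packing a small cpu list into a bitmask, the kind of compact encoding a cpu map payload can use:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int cpus[] = { 1, 2, 3 };
            uint64_t mask = 0;
            unsigned int i;

            for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++)
                    mask |= 1ULL << cpus[i];

            printf("mask = %#llx\n", (unsigned long long)mask); /* 0xe */
            return 0;
    }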
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-25-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 29 ++++++++++++++++++++++++++++ tools/perf/util/event.h | 5 +++++ tools/perf/util/header.c | 42 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/header.h | 3 +++ 4 files changed, 79 insertions(+) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 482b896..012eab5 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -53,6 +53,29 @@ static int process_event_name(struct perf_tool *tool, return 0; } +static int process_event_cpus(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct event_update_event *ev = (struct event_update_event*) event; + struct event_update_event_cpus *ev_data; + struct cpu_map *map; + + ev_data = (struct event_update_event_cpus*) ev->data; + + map = cpu_map__new_data(&ev_data->cpus); + + TEST_ASSERT_VAL("wrong id", ev->id == 123); + TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__CPUS); + TEST_ASSERT_VAL("wrong cpus", map->nr == 3); + TEST_ASSERT_VAL("wrong cpus", map->map[0] == 1); + TEST_ASSERT_VAL("wrong cpus", map->map[1] == 2); + TEST_ASSERT_VAL("wrong cpus", map->map[2] == 3); + cpu_map__put(map); + return 0; +} + int test__event_update(int subtest __maybe_unused) { struct perf_evlist *evlist; @@ -84,5 +107,11 @@ int test__event_update(int subtest __maybe_unused) TEST_ASSERT_VAL("failed to synthesize attr update name", !perf_event__synthesize_event_update_name(&tmp.tool, evsel, process_event_name)); + evsel->own_cpus = cpu_map__new("1,2,3"); + + TEST_ASSERT_VAL("failed to synthesize attr update cpus", + !perf_event__synthesize_event_update_cpus(&tmp.tool, evsel, process_event_cpus)); + + cpu_map__put(evsel->own_cpus); return 0; } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 235196b..b7ffb7e 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -312,6 +312,11 @@ enum { PERF_EVENT_UPDATE__UNIT = 0, PERF_EVENT_UPDATE__SCALE = 1, PERF_EVENT_UPDATE__NAME = 2, + PERF_EVENT_UPDATE__CPUS = 3, +}; + +struct event_update_event_cpus { + struct cpu_map_data cpus; }; struct event_update_event_scale { diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index cd3d005..79d3eb9 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2762,6 +2762,38 @@ perf_event__synthesize_event_update_name(struct perf_tool *tool, return err; } +int +perf_event__synthesize_event_update_cpus(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process) +{ + size_t size = sizeof(struct event_update_event); + struct event_update_event *ev; + int max, err; + u16 type; + + if (!evsel->own_cpus) + return 0; + + ev = cpu_map_data__alloc(evsel->own_cpus, &size, &type, &max); + if (!ev) + return -ENOMEM; + + ev->header.type = PERF_RECORD_EVENT_UPDATE; + ev->header.size = (u16)size; + ev->type = PERF_EVENT_UPDATE__CPUS; + ev->id = evsel->id[0]; + + cpu_map_data__synthesize((struct cpu_map_data *) ev->data, + evsel->own_cpus, + type, max); + + err = process(tool, (union perf_event*) ev, NULL, NULL); + free(ev); + return err; +} + + int perf_event__synthesize_attrs(struct perf_tool *tool, struct perf_session *session, perf_event__handler_t process) @@ -2827,8 
+2859,11 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, case PERF_EVENT_UPDATE__SCALE: ev_scale = (struct event_update_event_scale *) ev->data; evsel->scale = ev_scale->scale; + break; + case PERF_EVENT_UPDATE__CPUS: + ev_cpus = (struct event_update_event_cpus *) ev->data; + + map = cpu_map__new_data(&ev_cpus->cpus); + if (map) + evsel->own_cpus = map; + else + pr_err("failed to get event_update cpus\n"); default: break; } diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 51cf566..a1bc0c57 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -114,6 +114,9 @@ int perf_event__synthesize_event_update_scale(struct perf_tool *tool, int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct perf_evsel *evsel, perf_event__handler_t process); +int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, + struct perf_evsel *evsel, + perf_event__handler_t process); int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, struct perf_evlist **pevlist); int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, -- cgit v1.1 From c853f9394b7bc189632673cac802bdbf6537463b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:41 +0100 Subject: perf tools: Add perf_event__fprintf_event_update function To display an 'event update' event in the raw dump. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-26-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 38 ++++++++++++++++++++++++++++++++++++++ tools/perf/util/header.h | 1 + 2 files changed, 39 insertions(+) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 79d3eb9..49676c1 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -2793,6 +2793,44 @@ perf_event__synthesize_event_update_cpus(struct perf_tool *tool, return err; } +size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp) +{ + struct event_update_event *ev = &event->event_update; + struct event_update_event_scale *ev_scale; + struct event_update_event_cpus *ev_cpus; + struct cpu_map *map; + size_t ret; + + ret = fprintf(fp, "\n... id: %" PRIu64 "\n", ev->id); + + switch (ev->type) { + case PERF_EVENT_UPDATE__SCALE: + ev_scale = (struct event_update_event_scale *) ev->data; + ret += fprintf(fp, "... scale: %f\n", ev_scale->scale); + break; + case PERF_EVENT_UPDATE__UNIT: + ret += fprintf(fp, "... unit: %s\n", ev->data); + break; + case PERF_EVENT_UPDATE__NAME: + ret += fprintf(fp, "... name: %s\n", ev->data); + break; + case PERF_EVENT_UPDATE__CPUS: + ev_cpus = (struct event_update_event_cpus *) ev->data; + ret += fprintf(fp, "... "); + + map = cpu_map__new_data(&ev_cpus->cpus); + if (map) + ret += cpu_map__fprintf(map, fp); + else + ret += fprintf(fp, "failed to get cpus\n"); + break; + default: + ret += fprintf(fp, "...
unknown type\n"); + break; + } + + return ret; +} int perf_event__synthesize_attrs(struct perf_tool *tool, struct perf_session *session, diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index a1bc0c57..710deec 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -122,6 +122,7 @@ int perf_event__process_attr(struct perf_tool *tool, union perf_event *event, int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_evlist **pevlist); +size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp); int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct perf_evlist *evlist, -- cgit v1.1 From 2d2aea6ae736503d3896c4997b494760ed8febc1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:42 +0100 Subject: perf report: Display newly added events in raw dump The 'perf report -D' command will now display detailed output for these newly added events: event_update thread_map cpu_map stat stat_config stat_round Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-27-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/session.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 49e5cdc..a90c74b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -17,6 +17,7 @@ #include "asm/bug.h" #include "auxtrace.h" #include "thread-stack.h" +#include "stat.h" static int perf_session__deliver_event(struct perf_session *session, union perf_event *event, @@ -210,6 +211,9 @@ static int process_event_synth_event_update_stub(struct perf_tool *tool __maybe_ struct perf_evlist **pevlist __maybe_unused) { + if (dump_trace) + perf_event__fprintf_event_update(event, stdout); + dump_printf(": unhandled!\n"); return 0; } @@ -311,6 +315,9 @@ int process_event_thread_map_stub(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_session *session __maybe_unused) { + if (dump_trace) + perf_event__fprintf_thread_map(event, stdout); + dump_printf(": unhandled!\n"); return 0; } @@ -320,6 +327,9 @@ int process_event_cpu_map_stub(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_session *session __maybe_unused) { + if (dump_trace) + perf_event__fprintf_cpu_map(event, stdout); + dump_printf(": unhandled!\n"); return 0; } @@ -329,6 +339,9 @@ int process_event_stat_config_stub(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct perf_session *session __maybe_unused) { + if (dump_trace) + perf_event__fprintf_stat_config(event, stdout); + dump_printf(": unhandled!\n"); return 0; } @@ -338,6 +351,9 @@ static int process_stat_stub(struct perf_tool *tool __maybe_unused, struct perf_session *perf_session __maybe_unused) { + if (dump_trace) + perf_event__fprintf_stat(event, stdout); + dump_printf(": unhandled!\n"); return 0; } @@ -347,6 +363,9 @@ static int process_stat_round_stub(struct perf_tool *tool __maybe_unused, struct perf_session *perf_session __maybe_unused) { + if (dump_trace) + perf_event__fprintf_stat_round(event, stdout); + dump_printf(": unhandled!\n"); return 0; } -- cgit v1.1 From ffa517adf625fa6a6c168285534e1ff7344fa2f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 25 Oct 2015 15:51:43 +0100 Subject: perf tools: Introduce stat perf.data header 
feature Introducing the 'stat' feature to mark a perf.data as created by the 'perf stat record' command. It contains no data. It's needed so that the report tools (report/script) can differentiate sampling data from counting data, because they need to be treated in a different way. In the future it might be used to store the version of the stat storage system used. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1445784728-21732-28-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 2 ++ tools/perf/util/header.c | 14 ++++++++++++++ tools/perf/util/header.h | 1 + 3 files changed, 17 insertions(+) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 1435ef6..9c5cdc2c4 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -452,6 +452,8 @@ static void record__init_features(struct record *rec) if (!rec->opts.full_auxtrace) perf_header__clear_feat(&session->header, HEADER_AUXTRACE); + + perf_header__clear_feat(&session->header, HEADER_STAT); } static volatile int workload_exec_errno; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 49676c1..f50b723 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -868,6 +868,13 @@ static int write_auxtrace(int fd, struct perf_header *h, return err; } +static int write_stat(int fd __maybe_unused, + struct perf_header *h __maybe_unused, + struct perf_evlist *evlist __maybe_unused) +{ + return 0; +} + static void print_hostname(struct perf_header *ph, int fd __maybe_unused, FILE *fp) { @@ -1159,6 +1166,12 @@ static void print_auxtrace(struct perf_header *ph __maybe_unused, fprintf(fp, "# contains AUX area data (e.g. instruction trace)\n"); } +static void print_stat(struct perf_header *ph __maybe_unused, + int fd __maybe_unused, FILE *fp) +{ + fprintf(fp, "# contains stat data\n"); +} + static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused, FILE *fp) { @@ -1948,6 +1961,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPP(HEADER_PMU_MAPPINGS, pmu_mappings), FEAT_OPP(HEADER_GROUP_DESC, group_desc), FEAT_OPP(HEADER_AUXTRACE, auxtrace), + FEAT_OPA(HEADER_STAT, stat), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 710deec..cff9892 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -31,6 +31,7 @@ enum { HEADER_PMU_MAPPINGS, HEADER_GROUP_DESC, HEADER_AUXTRACE, + HEADER_STAT, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; -- cgit v1.1 From 4979d0c7d0c73a3e799d4dcfbacd3cd11cc55638 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:46 +0100 Subject: perf stat record: Add record command Add 'perf stat record' command support. It creates a simple (header only) perf.data file ATM. The record command can be specified anywhere among the stat options. All stat command options are valid for the stat record command, with the exception of the '-o' option: if specified for the record command, it denotes the perf data file name.
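To make the command shape concrete, a couple of illustrative invocations (the event and output file names here are made up, not taken from the patch):

    $ perf stat record -e cycles usleep 1           # writes the default perf.data
    $ perf stat -e cycles record -o stat.data ls    # 'record' placed among the stat options,
                                                    # -o names the output file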
Committer note: Set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless while avoiding that older tools show confusing messages, for instance, with sample_type = 0, we get: $ perf stat record usleep 1 Performance counter stats for 'usleep 1': 0.630237 task-clock (msec) # 0.528 CPUs utilized 1 context-switches # 0.002 M/sec 0 cpu-migrations # 0.000 K/sec 52 page-faults # 0.083 M/sec 978,312 cycles # 1.552 GHz 671,931 stalled-cycles-frontend # 68.68% frontend cycles idle stalled-cycles-backend 646,379 instructions # 0.66 insns per cycle # 1.04 stalled cycles per insn 131,046 branches # 207.931 M/sec 7,073 branch-misses # 5.40% of all branches 0.001193240 seconds time elapsed $ oldperf evlist WARNING: The perf.data file's data size field is 0 which is unexpected. Was the 'perf record' command properly terminated? non matching sample_type $ While with sample_type set to PERF_SAMPLE_IDENTIFIER, after we re-run 'perf stat record usleep' we get: $ oldperf evlist WARNING: The perf.data file's data size field is 0 which is unexpected. Was the 'perf record' command properly terminated? task-clock context-switches cpu-migrations page-faults cycles stalled-cycles-frontend stalled-cycles-backend instructions branches branch-misses $ Which at least shows the names of the events in the perf.data file. Additionally, such files, when passed to 'perf report' will produce: $ oldperf report --stdio WARNING: The perf.data file's data size field is 0 which is unexpected. Was the 'perf record' command properly terminated? Warning: Kernel address maps (/proc/{kallsyms,modules}) were restricted. Check /proc/sys/kernel/kptr_restrict before running 'perf record'. As no suitable kallsyms nor vmlinux was found, kernel samples can't be resolved. Samples in kernel modules can't be resolved as well. Error: The perf.data file has no samples! # To display the perf.data header info, please use --header/--header-only options. # $ Which is confusing and can be solved by just adding the kernel mmap record, which will also remove that warning about the data size field being equal to zero, after generating the mmap record: $ perf stat record usleep 1 Performance counter stats for 'usleep 1': 0.600796 task-clock (msec) # 0.478 CPUs utilized 1 context-switches # 0.002 M/sec 0 cpu-migrations # 0.000 K/sec 54 page-faults # 0.090 M/sec 886,844 cycles # 1.476 GHz 582,169 stalled-cycles-frontend # 65.65% frontend cycles idle stalled-cycles-backend 638,344 instructions # 0.72 insns per cycle # 0.91 stalled cycles per insn 130,204 branches # 216.719 M/sec 7,500 branch-misses # 5.76% of all branches 0.001255897 seconds time elapsed $ oldperf evlist task-clock context-switches cpu-migrations page-faults cycles stalled-cycles-frontend stalled-cycles-backend instructions branches branch-misses $ oldperf report --stdio Error: The perf.data file has no samples! # To display the perf.data header info, please use --header/--header-only options. # [acme@zoo linux]$ No warnings, sensible output about what are the events in the perf.data file and also a "file has no samples" message, which indeed it doesn't. 
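As a minimal sketch of the resulting counter setup (illustrative only; the authoritative change is the create_perf_stat_counter() hunk below):

    /*
     * Sketch: how a counting-only event ends up configured. The
     * non-zero sample_type is what keeps older tools from bailing
     * out with "non matching sample_type" on a samples-free file.
     */
    static void setup_counting_attr(struct perf_event_attr *attr)
    {
            attr->sample_period = 0;                     /* count, do not sample */
            attr->sample_type = PERF_SAMPLE_IDENTIFIER;  /* harmless, placates older tools */
            attr->disabled = 1;                          /* enabled later, around the workload */
    }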
Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 12 ++++ tools/perf/builtin-stat.c | 120 +++++++++++++++++++++++++++++++-- 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 4e074a6..70eee1c 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -10,6 +10,7 @@ SYNOPSIS [verse] 'perf stat' [-e | --event=EVENT] [-a] 'perf stat' [-e | --event=EVENT] [-a] -- [] +'perf stat' [-e | --event=EVENT] [-a] record [-o file] -- [] DESCRIPTION ----------- @@ -22,6 +23,8 @@ OPTIONS ...:: Any command you can specify in a shell. +record:: + See STAT RECORD. -e:: --event=:: @@ -159,6 +162,15 @@ filter out the startup phase of the program, which is often very different. Print statistics of transactional execution if supported. +STAT RECORD ----------- +Stores stat data into perf data file. + +-o file:: +--output file:: +Output file name. + + EXAMPLES -------- diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index bbf42ee..af2a3bf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -59,6 +59,7 @@ #include "util/thread.h" #include "util/thread_map.h" #include "util/counts.h" +#include "util/session.h" #include #include @@ -126,6 +127,16 @@ static bool append_file; static const char *output_name; static int output_fd; +struct perf_stat { + bool record; + struct perf_data_file file; + struct perf_session *session; + u64 bytes_written; +}; + +static struct perf_stat perf_stat; +#define STAT_RECORD perf_stat.record + static volatile int done = 0; static struct perf_stat_config stat_config = { @@ -166,7 +177,11 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) * like tracepoints. Clear it up for counting. */ attr->sample_period = 0; - attr->sample_type = 0; + /* + * But set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless + * while avoiding that older tools show confusing messages. 
+ */ + attr->sample_type = PERF_SAMPLE_IDENTIFIER; /* * Disabling all counters initially, they will be enabled @@ -202,6 +217,26 @@ static inline int nsec_counter(struct perf_evsel *evsel) return 0; } +static int perf_stat__write(struct perf_stat *stat, void *bf, size_t size) +{ + if (perf_data_file__write(stat->session->file, bf, size) < 0) { + pr_err("failed to write perf data, error: %m\n"); + return -1; + } + + stat->bytes_written += size; + return 0; +} + +static int process_synthesized_event(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + struct perf_stat *stat = (void *)tool; + return perf_stat__write(stat, event, event->header.size); +} + /* * Read out the results of a single counter: * do not aggregate counts across CPUs in system-wide mode @@ -361,6 +396,15 @@ static int __run_perf_stat(int argc, const char **argv) return -1; } + if (STAT_RECORD) { + int err, fd = perf_data_file__fd(&perf_stat.file); + + err = perf_session__write_header(perf_stat.session, evsel_list, + fd, false); + if (err < 0) + return err; + } + /* * Enable counters and exec the command: */ @@ -1261,6 +1305,38 @@ static int add_default_attributes(void) return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs); } +static const char * const record_usage[] = { + "perf stat record []", + NULL, +}; + +static int __cmd_record(int argc, const char **argv) +{ + struct perf_session *session; + struct perf_data_file *file = &perf_stat.file; + + argc = parse_options(argc, argv, stat_options, record_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + + if (output_name) + file->path = output_name; + + session = perf_session__new(file, false, NULL); + if (session == NULL) { + pr_err("Perf session creation failed.\n"); + return -1; + } + + /* No pipe support ATM */ + if (perf_stat.file.is_pipe) + return -EINVAL; + + session->evlist = evsel_list; + perf_stat.session = session; + perf_stat.record = true; + return argc; +} + int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) { const char * const stat_usage[] = { @@ -1271,6 +1347,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) const char *mode; FILE *output = stderr; unsigned int interval; + const char * const stat_subcommands[] = { "record" }; setlocale(LC_ALL, ""); @@ -1278,12 +1355,22 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) if (evsel_list == NULL) return -ENOMEM; - argc = parse_options(argc, argv, stat_options, stat_usage, - PARSE_OPT_STOP_AT_NON_OPTION); + argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands, + (const char **) stat_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + + if (argc && !strncmp(argv[0], "rec", 3)) { + argc = __cmd_record(argc, argv); + if (argc < 0) + return -1; + } interval = stat_config.interval; - if (output_name && strcmp(output_name, "-")) + /* + * For record command the -o is already taken care of. 
+ */ + if (!STAT_RECORD && output_name && strcmp(output_name, "-")) output = NULL; if (output_name && output_fd) { @@ -1450,6 +1537,31 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) if (!forever && status != -1 && !interval) print_counters(NULL, argc, argv); + if (STAT_RECORD) { + /* + * We synthesize the kernel mmap record just so that older tools + * don't emit warnings about not being able to resolve symbols + * due to /proc/sys/kernel/kptr_restrict settings and instead provide + * a saner message about no samples being in the perf.data file. + * + * This also serves to suppress a warning about f_header.data.size == 0 + * in header.c. -acme + */ + int fd = perf_data_file__fd(&perf_stat.file); + int err = perf_event__synthesize_kernel_mmap((void *)&perf_stat, + process_synthesized_event, + &perf_stat.session->machines.host); + if (err) { + pr_warning("Couldn't synthesize the kernel mmap record, harmless, " + "older tools may produce warnings about this file.\n"); + } + + perf_stat.session->header.data_size += perf_stat.bytes_written; + perf_session__write_header(perf_stat.session, evsel_list, fd, true); + + perf_session__delete(perf_stat.session); + } + perf_stat__exit_aggr_mode(); perf_evlist__free_stats(evsel_list); out: -- cgit v1.1 From 3ba78bd00e508bf46a6aa2b8e296dc8287ea4c29 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:47 +0100 Subject: perf stat record: Initialize record features Disabling all non-stat related features. Also, as we now enable the STAT feature in the data file, adding code to instruct session open to skip sample type checking for stat data files. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 15 +++++++++++++++ tools/perf/util/session.c | 3 +++ 2 files changed, 18 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index af2a3bf..c9c896a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1310,6 +1310,19 @@ static const char * const record_usage[] = { NULL, }; +static void init_features(struct perf_session *session) +{ + int feat; + + for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++) + perf_header__set_feat(&session->header, feat); + + perf_header__clear_feat(&session->header, HEADER_BUILD_ID); + perf_header__clear_feat(&session->header, HEADER_TRACING_DATA); + perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK); + perf_header__clear_feat(&session->header, HEADER_AUXTRACE); +} + static int __cmd_record(int argc, const char **argv) { struct perf_session *session; @@ -1331,6 +1344,8 @@ static int __cmd_record(int argc, const char **argv) if (perf_stat.file.is_pipe) return -EINVAL; + init_features(session); + session->evlist = evsel_list; perf_stat.session = session; perf_stat.record = true; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index a90c74b..d5636ba 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -37,6 +37,9 @@ static int perf_session__open(struct perf_session *session) if (perf_data_file__is_pipe(file)) return 0; + if (perf_header__has_feat(&session->header, HEADER_STAT)) + return 0; + if (!perf_evlist__valid_sample_type(session->evlist)) { pr_err("non matching sample_type\n"); return -1; -- cgit v1.1 From
8b99b1a4e0b082ea6a277766982dac84483d4d3c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:48 +0100 Subject: perf stat record: Synthesize stat record data Synthesizing needed stat record data for report/script: - cpu/thread maps - stat config Committer note: New records generated on a perf.data file with this patch: $ perf report -D | grep PERF_RECORD_ 0x568 [0x28]: PERF_RECORD_THREAD_MAP nr: 1 thread: 29097 0x590 [0x12]: PERF_RECORD_CPU_MAP nr: 1 cpu: 65535 0x5a2 [0x40]: PERF_RECORD_STAT_CONFIG $ Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-5-git-send-email-jolsa@kernel.org [ Adjusted wrt kernel PERF_RECORD_MMAP added when introducing 'perf stat record' ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 56 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c9c896a..45bf4d2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -217,26 +217,20 @@ static inline int nsec_counter(struct perf_evsel *evsel) return 0; } -static int perf_stat__write(struct perf_stat *stat, void *bf, size_t size) +static int process_synthesized_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) { - if (perf_data_file__write(stat->session->file, bf, size) < 0) { + if (perf_data_file__write(&perf_stat.file, event, event->header.size) < 0) { pr_err("failed to write perf data, error: %m\n"); return -1; } - stat->bytes_written += size; + perf_stat.bytes_written += event->header.size; return 0; } -static int process_synthesized_event(struct perf_tool *tool, - union perf_event *event, - struct perf_sample *sample __maybe_unused, - struct machine *machine __maybe_unused) -{ - struct perf_stat *stat = (void *)tool; - return perf_stat__write(stat, event, event->header.size); -} - /* * Read out the results of a single counter: * do not aggregate counts across CPUs in system-wide mode @@ -323,6 +317,35 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf workload_exec_errno = info->si_value.sival_int; } +static int perf_stat_synthesize_config(void) +{ + int err; + + err = perf_event__synthesize_thread_map2(NULL, evsel_list->threads, + process_synthesized_event, + NULL); + if (err < 0) { + pr_err("Couldn't synthesize thread map.\n"); + return err; + } + + err = perf_event__synthesize_cpu_map(NULL, evsel_list->cpus, + process_synthesized_event, NULL); + if (err < 0) { + pr_err("Couldn't synthesize cpu map.\n"); + return err; + } + + err = perf_event__synthesize_stat_config(NULL, &stat_config, + process_synthesized_event, NULL); + if (err < 0) { + pr_err("Couldn't synthesize config.\n"); + return err; + } + + return 0; +} + static int __run_perf_stat(int argc, const char **argv) { int interval = stat_config.interval; @@ -403,6 +426,10 @@ fd, false); if (err < 0) return err; + + err = perf_stat_synthesize_config(); + if (err < 0) + return err; } /* @@ -1560,7 +1587,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) * a saner message about no samples being in the perf.data file. * * This also serves to suppress a warning about f_header.data.size == 0 - * in header.c. 
-acme + * in header.c at the moment 'perf stat record' gets introduced, which + * is not really needed once we start adding the stat specific PERF_RECORD_ + * records, but the need to suppress the kptr_restrict messages in older + * tools remain -acme */ int fd = perf_data_file__fd(&perf_stat.file); int err = perf_event__synthesize_kernel_mmap((void *)&perf_stat, -- cgit v1.1 From 1c59612de0264790698e32eb0368daf3fcba4c65 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:49 +0100 Subject: perf evlist: Export id_add_fd() Will be used to store the event IDs in the evlist object so they get stored into the perf.data file. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-6-git-send-email-jolsa@kernel.org [ Split from the patch storing the ids in the perf.data file ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 6 +++--- tools/perf/util/evlist.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 8c44aad..b9eac0d 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -534,9 +534,9 @@ void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, evsel->id[evsel->ids++] = id; } -static int perf_evlist__id_add_fd(struct perf_evlist *evlist, - struct perf_evsel *evsel, - int cpu, int thread, int fd) +int perf_evlist__id_add_fd(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, int fd) { u64 read_data[4] = { 0, }; int id_idx = 1; /* The first entry is the counter value */ diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a459fe7..139a500 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -97,6 +97,9 @@ perf_evlist__find_tracepoint_by_name(struct perf_evlist *evlist, void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, u64 id); +int perf_evlist__id_add_fd(struct perf_evlist *evlist, + struct perf_evsel *evsel, + int cpu, int thread, int fd); int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); -- cgit v1.1 From 2af4646d1041ee590b0032d2b0103fa81aa43174 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:49 +0100 Subject: perf stat record: Store event IDs in perf data file Store event IDs in the evlist object so they get stored into the perf.data file.
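For orientation, a sketch of where the id comes from when perf_evlist__id_add_fd() reads a counter fd (this mirrors, but is not, the tools/perf code; it assumes read_format contains PERF_FORMAT_ID, with the time fields optional):

    /* Locate the event id in a buffer read(2) from a counter fd;
     * read_data[0] is always the counter value, hence id_idx = 1. */
    static u64 id_from_read_data(const u64 *read_data, u64 read_format)
    {
            int id_idx = 1;

            if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                    ++id_idx;
            if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                    ++id_idx;

            return read_data[id_idx];       /* this is what lands in the evlist */
    }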
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 45bf4d2..39d0c30f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -346,6 +346,38 @@ static int perf_stat_synthesize_config(void) return 0; } +#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) + +static int __store_counter_ids(struct perf_evsel *counter, + struct cpu_map *cpus, + struct thread_map *threads) +{ + int cpu, thread; + + for (cpu = 0; cpu < cpus->nr; cpu++) { + for (thread = 0; thread < threads->nr; thread++) { + int fd = FD(counter, cpu, thread); + + if (perf_evlist__id_add_fd(evsel_list, counter, + cpu, thread, fd) < 0) + return -1; + } + } + + return 0; +} + +static int store_counter_ids(struct perf_evsel *counter) +{ + struct cpu_map *cpus = counter->cpus; + struct thread_map *threads = counter->threads; + + if (perf_evsel__alloc_id(counter, cpus->nr, threads->nr)) + return -ENOMEM; + + return __store_counter_ids(counter, cpus, threads); +} + static int __run_perf_stat(int argc, const char **argv) { int interval = stat_config.interval; @@ -410,6 +442,9 @@ l = strlen(counter->unit); if (l > unit_width) unit_width = l; + + if (STAT_RECORD && store_counter_ids(counter)) + return -1; } if (perf_evlist__apply_filters(evsel_list, &counter)) { -- cgit v1.1 From 664c98d4e1c2ff60627d78d4c8ae81cd2df13783 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:50 +0100 Subject: perf stat record: Add pipe support for record command Allow storing stat record data into a pipe, so the report tools (report/script) can read data directly from record. Committer note: Before this patch: $ perf stat record -o - usleep 1 | perf report -i - incompatible file format (rerun with -v to learn more) $ perf stat record -o - usleep 1 | perf script -i - incompatible file format (rerun with -v to learn more) $ ls -la perf.data ls: cannot access perf.data: No such file or directory $ After: $ perf stat record -o - usleep 1 | perf report -i - # To display the perf.data header info, please use # --header/--header-only options. # Error: The - file has no samples! $ perf stat record -o - usleep 1 | perf script -i - Display of symbols requested but neither sample IP nor sample address is selected. Hence, no addresses to convert to symbols. 
0 [0x80]: failed to process type: 64 $ ls -la perf.data ls: cannot access perf.data: No such file or directory $ Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 39d0c30f..8a2f9ce 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -317,10 +317,19 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf workload_exec_errno = info->si_value.sival_int; } -static int perf_stat_synthesize_config(void) +static int perf_stat_synthesize_config(bool is_pipe) { int err; + if (is_pipe) { + err = perf_event__synthesize_attrs(NULL, perf_stat.session, + process_synthesized_event); + if (err < 0) { + pr_err("Couldn't synthesize attrs.\n"); + return err; + } + } + err = perf_event__synthesize_thread_map2(NULL, evsel_list->threads, process_synthesized_event, NULL); @@ -388,6 +397,7 @@ static int __run_perf_stat(int argc, const char **argv) size_t l; int status = 0; const bool forks = (argc > 0); + bool is_pipe = STAT_RECORD ? perf_stat.file.is_pipe : false; if (interval) { ts.tv_sec = interval / 1000; @@ -398,7 +408,7 @@ static int __run_perf_stat(int argc, const char **argv) } if (forks) { - if (perf_evlist__prepare_workload(evsel_list, &target, argv, false, + if (perf_evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) { perror("failed to prepare workload"); return -1; @@ -457,12 +467,17 @@ static int __run_perf_stat(int argc, const char **argv) if (STAT_RECORD) { int err, fd = perf_data_file__fd(&perf_stat.file); - err = perf_session__write_header(perf_stat.session, evsel_list, - fd, false); + if (is_pipe) { + err = perf_header__write_pipe(perf_data_file__fd(&perf_stat.file)); + } else { + err = perf_session__write_header(perf_stat.session, evsel_list, + fd, false); + } + if (err < 0) return err; - err = perf_stat_synthesize_config(); + err = perf_stat_synthesize_config(is_pipe); if (err < 0) return err; } @@ -970,6 +985,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) struct perf_evsel *counter; char buf[64], *prefix = NULL; + /* Do not print anything if we record to the pipe. 
*/ + if (STAT_RECORD && perf_stat.file.is_pipe) + return; + if (interval) print_interval(prefix = buf, ts); else @@ -1402,10 +1421,6 @@ static int __cmd_record(int argc, const char **argv) return -1; } - /* No pipe support ATM */ - if (perf_stat.file.is_pipe) - return -EINVAL; - init_features(session); session->evlist = evsel_list; @@ -1636,8 +1651,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) "older tools may produce warnings about this file.\n"); } - perf_stat.session->header.data_size += perf_stat.bytes_written; - perf_session__write_header(perf_stat.session, evsel_list, fd, true); + if (!perf_stat.file.is_pipe) { + perf_stat.session->header.data_size += perf_stat.bytes_written; + perf_session__write_header(perf_stat.session, evsel_list, fd, true); + } perf_session__delete(perf_stat.session); } -- cgit v1.1 From 5a6ea81b8f9ce2736759d256ac4d63be65751199 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:51 +0100 Subject: perf stat record: Write stat events on record Writing stat events on 'perf stat record' at the time we read counter values from the kernel. Committer note: After the patch: $ perf stat record usleep 1 Performance counter stats for 'usleep 1': 0.598006 task-clock (msec) # 0.484 CPUs utilized 1 context-switches # 0.002 M/sec 0 cpu-migrations # 0.000 K/sec 52 page-faults # 0.087 M/sec 882,744 cycles # 1.476 GHz 581,416 stalled-cycles-frontend # 65.86% frontend cycles idle stalled-cycles-backend 636,479 instructions # 0.72 insns per cycle # 0.91 stalled cycles per insn 129,334 branches # 216.275 M/sec 7,512 branch-misses # 5.81% of all branches 0.001235157 seconds time elapsed $ oldperf evlist task-clock context-switches cpu-migrations page-faults cycles stalled-cycles-frontend stalled-cycles-backend instructions branches branch-misses $ oldperf report --stdio Error: The perf.data file has no samples! # To display the perf.data header info, please use --header/--header-only options. 
# $ perf report -D | grep PERF_RECORD 0x5b0 [0x28]: PERF_RECORD_THREAD_MAP nr: 1 thread: 5504 0x5d8 [0x12]: PERF_RECORD_CPU_MAP nr: 1 cpu: 65535 0x5ea [0x40]: PERF_RECORD_STAT_CONFIG 0x62a [0x30]: PERF_RECORD_STAT 0x65a [0x30]: PERF_RECORD_STAT 0x68a [0x30]: PERF_RECORD_STAT 0x6ba [0x30]: PERF_RECORD_STAT 0x6ea [0x30]: PERF_RECORD_STAT 0x71a [0x30]: PERF_RECORD_STAT 0x74a [0x30]: PERF_RECORD_STAT 0x77a [0x30]: PERF_RECORD_STAT 0x7aa [0x30]: PERF_RECORD_STAT -1 -1 0x7da [0x40]: PERF_RECORD_MMAP -1/0: [0xffffffff81000000(0x1f000000) @ 0xffffffff81000000]: x [kernel.kallsyms]_text $ Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-8-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 19 +++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8a2f9ce..32aa2ea 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -231,6 +231,18 @@ static int process_synthesized_event(struct perf_tool *tool __maybe_unused, return 0; } +#define SID(e, x, y) xyarray__entry(e->sample_id, x, y) + +static int +perf_evsel__write_stat_event(struct perf_evsel *counter, u32 cpu, u32 thread, + struct perf_counts_values *count) +{ + struct perf_sample_id *sid = SID(counter, cpu, thread); + + return perf_event__synthesize_stat(NULL, cpu, thread, sid->id, count, + process_synthesized_event, NULL); +} + /* * Read out the results of a single counter: * do not aggregate counts across CPUs in system-wide mode @@ -254,6 +266,13 @@ static int read_counter(struct perf_evsel *counter) count = perf_counts(counter->counts, cpu, thread); if (perf_evsel__read(counter, cpu, thread, count)) return -1; + + if (STAT_RECORD) { + if (perf_evsel__write_stat_event(counter, cpu, thread, count)) { + pr_err("failed to write stat event\n"); + return -1; + } + } } } -- cgit v1.1 From 7aad0c32bb6aaa39aab596264ddc49d44c8088f3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:52 +0100 Subject: perf stat record: Write stat round events on record Writing stat round events on 'perf stat record' for each interval round. In non-interval mode we store the round event after the last stat event.
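An illustrative way to observe both cases (interval length and workload made up):

    $ perf stat record -I 1000 -e cycles sleep 3    # one INTERVAL round per 1000 ms
    $ perf stat record -e cycles usleep 1           # one FINAL round after the last stat event
    $ perf report -D | grep ROUND                   # inspect the synthesized round records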
Committer note: After the patch: $ perf report -D | grep PERF_RECORD | grep ROUND 0x852 [0x18]: PERF_RECORD_STAT_ROUND $ Reported-by: Kan Liang Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 32aa2ea..fcece42 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -231,6 +231,16 @@ static int process_synthesized_event(struct perf_tool *tool __maybe_unused, return 0; } +static int write_stat_round_event(u64 time, u64 type) +{ + return perf_event__synthesize_stat_round(NULL, time, type, + process_synthesized_event, + NULL); +} + +#define WRITE_STAT_ROUND_EVENT(time, interval) \ + write_stat_round_event(time, PERF_STAT_ROUND_TYPE__ ## interval) + #define SID(e, x, y) xyarray__entry(e->sample_id, x, y) static int @@ -306,6 +316,11 @@ static void process_interval(void) clock_gettime(CLOCK_MONOTONIC, &ts); diff_timespec(&rs, &ts, &ref_time); + if (STAT_RECORD) { + if (WRITE_STAT_ROUND_EVENT(rs.tv_sec * NSECS_PER_SEC + rs.tv_nsec, INTERVAL)) + pr_err("failed to write stat round event\n"); + } + print_counters(&rs, 0, NULL); } @@ -1670,6 +1685,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) "older tools may produce warnings about this file.\n"); } + if (!interval) { + if (WRITE_STAT_ROUND_EVENT(walltime_nsecs_stats.max, FINAL)) + pr_err("failed to write stat round event\n"); + } + if (!perf_stat.file.is_pipe) { perf_stat.session->header.data_size += perf_stat.bytes_written; perf_session__write_header(perf_stat.session, evsel_list, fd, true); -- cgit v1.1 From e9d6db8e8df42a38f79f264ab58c104e1678b12c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:53 +0100 Subject: perf stat record: Do not allow record with multiple runs mode We currently don't support storing multiple sessions in perf.data, so we can't allow the -r option in stat record. $ perf stat -e cycles -r 2 record ls Cannot use -r option with perf stat record. 
Committer note: Before this patch we would get a perf.data file such as: $ perf stat -e cycles -r 2 record ls Performance counter stats for 'ls' (2 runs): 3,935,236 cycles 0.002353261 seconds time elapsed ( +- 4.76% ) $ perf report -D | grep PERF_RECORD | grep ROUND 0xf0 [0]: failed to process type: 16 Error: failed to process sample $ Reported-by: Kan Liang Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-10-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fcece42..10f86a6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1449,6 +1449,11 @@ static int __cmd_record(int argc, const char **argv) if (output_name) file->path = output_name; + if (run_count != 1 || forever) { + pr_err("Cannot use -r option with perf stat record.\n"); + return -1; + } + session = perf_session__new(file, false, NULL); if (session == NULL) { pr_err("Perf session creation failed.\n"); return -1; } -- cgit v1.1 From 7b60a7e3a687481553d2b6ec7e6390a6e82f1849 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:54 +0100 Subject: perf stat record: Synthesize event update events Synthesize the other event data not carried within the attr event: unit, scale, name. Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-11-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 10f86a6..575e253 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -351,8 +351,19 @@ static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *inf workload_exec_errno = info->si_value.sival_int; } +static bool has_unit(struct perf_evsel *counter) +{ + return counter->unit && *counter->unit; +} + +static bool has_scale(struct perf_evsel *counter) +{ + return counter->scale != 1; +} + static int perf_stat_synthesize_config(bool is_pipe) { + struct perf_evsel *counter; int err; if (is_pipe) { @@ -364,6 +375,54 @@ static int perf_stat_synthesize_config(bool is_pipe) } } + /* + * Synthesize other event data not carried within + * the attr event - unit, scale, name + */ + evlist__for_each(evsel_list, counter) { + if (!counter->supported) + continue; + + /* + * Synthesize unit and scale only if it's defined. + */ + if (has_unit(counter)) { + err = perf_event__synthesize_event_update_unit(NULL, counter, process_synthesized_event); + if (err < 0) { + pr_err("Couldn't synthesize evsel unit.\n"); + return err; + } + } + + if (has_scale(counter)) { + err = perf_event__synthesize_event_update_scale(NULL, counter, process_synthesized_event); + if (err < 0) { + pr_err("Couldn't synthesize evsel scale.\n"); + return err; + } + } + + if (counter->own_cpus) { + err = perf_event__synthesize_event_update_cpus(NULL, counter, process_synthesized_event); + if (err < 0) { + pr_err("Couldn't synthesize evsel cpus.\n"); + return err; + } + } + + /* + * Name is needed only for pipe output, + * perf.data carries event names. 
+ */ + if (is_pipe) { + err = perf_event__synthesize_event_update_name(NULL, counter, process_synthesized_event); + if (err < 0) { + pr_err("Couldn't synthesize evsel name.\n"); + return err; + } + } + } + err = perf_event__synthesize_thread_map2(NULL, evsel_list->threads, process_synthesized_event, NULL); -- cgit v1.1 From ba6039b6c8fcc24de7d6ab7b0bada4becaf84a2c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:55 +0100 Subject: perf stat report: Add report command Adding 'perf stat report' command support. ATM it only processes attr events and displays nothing. Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-12-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 12 +++++++ tools/perf/builtin-stat.c | 61 +++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 70eee1c..95f4928 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -11,6 +11,7 @@ SYNOPSIS 'perf stat' [-e | --event=EVENT] [-a] 'perf stat' [-e | --event=EVENT] [-a] -- [] 'perf stat' [-e | --event=EVENT] [-a] record [-o file] -- [] +'perf stat' report [-i file] DESCRIPTION ----------- @@ -26,6 +27,9 @@ OPTIONS record:: See STAT RECORD. +report:: + See STAT REPORT. + -e:: --event=:: Select the PMU event. Selection can be: @@ -170,6 +174,14 @@ Stores stat data into perf data file. --output file:: Output file name. +STAT REPORT ----------- +Reads and reports stat data from perf data file. + +-i file:: +--input file:: +Input file name. + EXAMPLES -------- diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 575e253..abba49b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -60,6 +60,8 @@ #include "util/thread_map.h" #include "util/counts.h" #include "util/session.h" +#include "util/tool.h" +#include "asm/bug.h" #include #include @@ -132,6 +134,7 @@ struct perf_stat { struct perf_data_file file; struct perf_session *session; u64 bytes_written; + struct perf_tool tool; }; static struct perf_stat perf_stat; @@ -1041,8 +1044,8 @@ static void print_header(int argc, const char **argv) else if (target.cpu_list) fprintf(output, "\'CPU(s) %s", target.cpu_list); else if (!target__has_task(&target)) { - fprintf(output, "\'%s", argv[0]); - for (i = 1; i < argc; i++) + fprintf(output, "\'%s", argv ? 
argv[0] : "pipe"); + for (i = 1; argv && (i < argc); i++) fprintf(output, " %s", argv[i]); } else if (target.pid) fprintf(output, "process id \'%s", target.pid); @@ -1527,6 +1530,55 @@ static int __cmd_record(int argc, const char **argv) return argc; } +static const char * const report_usage[] = { + "perf stat report []", + NULL, +}; + +static struct perf_stat perf_stat = { + .tool = { + .attr = perf_event__process_attr, + }, +}; + +static int __cmd_report(int argc, const char **argv) +{ + struct perf_session *session; + const struct option options[] = { + OPT_STRING('i', "input", &input_name, "file", "input file name"), + OPT_END() + }; + struct stat st; + int ret; + + argc = parse_options(argc, argv, options, report_usage, 0); + + if (!input_name || !strlen(input_name)) { + if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode)) + input_name = "-"; + else + input_name = "perf.data"; + } + + perf_stat.file.path = input_name; + perf_stat.file.mode = PERF_DATA_MODE_READ; + + session = perf_session__new(&perf_stat.file, false, &perf_stat.tool); + if (session == NULL) + return -1; + + perf_stat.session = session; + stat_config.output = stderr; + evsel_list = session->evlist; + + ret = perf_session__process_events(session); + if (ret) + return ret; + + perf_session__delete(session); + return 0; +} + int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) { const char * const stat_usage[] = { @@ -1537,7 +1589,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) const char *mode; FILE *output = stderr; unsigned int interval; - const char * const stat_subcommands[] = { "record" }; + const char * const stat_subcommands[] = { "record", "report" }; setlocale(LC_ALL, ""); @@ -1553,7 +1605,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) argc = __cmd_record(argc, argv); if (argc < 0) return -1; - } + } else if (argc && !strncmp(argv[0], "rep", 3)) + return __cmd_report(argc, argv); interval = stat_config.interval; -- cgit v1.1 From 1975d36e14b3314d1d0c7a428946ec0c27fd6e95 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:56 +0100 Subject: perf stat report: Process cpu/threads maps Adding processing of cpu/threads maps. Configuring session's evlist with these maps. Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-13-git-send-email-jolsa@kernel.org [ s/stat/st/g, s/time/tm/g parameters to fix 'already defined' build error with older distros (e.g. 
RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 66 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index abba49b..0a1cfdd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -135,6 +135,9 @@ struct perf_stat { struct perf_session *session; u64 bytes_written; struct perf_tool tool; + bool maps_allocated; + struct cpu_map *cpus; + struct thread_map *threads; }; static struct perf_stat perf_stat; @@ -234,9 +237,9 @@ static int process_synthesized_event(struct perf_tool *tool __maybe_unused, return 0; } -static int write_stat_round_event(u64 time, u64 type) +static int write_stat_round_event(u64 tm, u64 type) { - return perf_event__synthesize_stat_round(NULL, time, type, + return perf_event__synthesize_stat_round(NULL, tm, type, process_synthesized_event, NULL); } @@ -1530,6 +1533,63 @@ static int __cmd_record(int argc, const char **argv) return argc; } +static int set_maps(struct perf_stat *st) +{ + if (!st->cpus || !st->threads) + return 0; + + if (WARN_ONCE(st->maps_allocated, "stats double allocation\n")) + return -EINVAL; + + perf_evlist__set_maps(evsel_list, st->cpus, st->threads); + + if (perf_evlist__alloc_stats(evsel_list, true)) + return -ENOMEM; + + st->maps_allocated = true; + return 0; +} + +static +int process_thread_map_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session __maybe_unused) +{ + struct perf_stat *st = container_of(tool, struct perf_stat, tool); + + if (st->threads) { + pr_warning("Extra thread map event, ignoring.\n"); + return 0; + } + + st->threads = thread_map__new_event(&event->thread_map); + if (!st->threads) + return -ENOMEM; + + return set_maps(st); +} + +static +int process_cpu_map_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session __maybe_unused) +{ + struct perf_stat *st = container_of(tool, struct perf_stat, tool); + struct cpu_map *cpus; + + if (st->cpus) { + pr_warning("Extra cpu map event, ignoring.\n"); + return 0; + } + + cpus = cpu_map__new_data(&event->cpu_map.data); + if (!cpus) + return -ENOMEM; + + st->cpus = cpus; + return set_maps(st); +} + static const char * const report_usage[] = { "perf stat report []", NULL, }; @@ -1538,6 +1598,8 @@ static const char * const report_usage[] = { static struct perf_stat perf_stat = { .tool = { .attr = perf_event__process_attr, + .thread_map = process_thread_map_event, + .cpu_map = process_cpu_map_event, }, }; -- cgit v1.1 From 62ba18ba938a8740ab18e02342b282d7378986f7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:57 +0100 Subject: perf stat report: Process stat config event Adding processing of the stat config event and initializing the stat_config object. Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-14-git-send-email-jolsa@kernel.org [ Renamed 'stat' parameter to 'st' to fix 'already defined' build error with older distros (e.g. 
RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0a1cfdd..1e5db50 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1533,6 +1533,15 @@ static int __cmd_record(int argc, const char **argv) return argc; } +static +int process_stat_config_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session __maybe_unused) +{ + perf_event__read_stat_config(&stat_config, &event->stat_config); + return 0; +} + static int set_maps(struct perf_stat *st) { if (!st->cpus || !st->threads) @@ -1600,6 +1609,7 @@ static struct perf_stat perf_stat = { .attr = perf_event__process_attr, .thread_map = process_thread_map_event, .cpu_map = process_cpu_map_event, + .stat_config = process_stat_config_event, }, }; -- cgit v1.1 From 68d702f7a1202dd39d9fa01b7bea92ba9e5785d9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:58 +0100 Subject: perf stat report: Add support to initialize aggr_map from file Using perf.data's perf_env data to initialize aggregate config. Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-15-git-send-email-jolsa@kernel.org [ s/stat/st/g, s/socket/socket_id/g to fix 'already defined' build error with older distros (e.g. RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1e5db50..c780525 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1326,6 +1326,101 @@ static void perf_stat__exit_aggr_mode(void) cpus_aggr_map = NULL; } +static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, int idx) +{ + int cpu; + + if (idx >= map->nr) + return -1; + + cpu = map->map[idx]; + + if (cpu >= env->nr_cpus_online) + return -1; + + return cpu; +} + +static int perf_env__get_socket(struct cpu_map *map, int idx, void *data) +{ + struct perf_env *env = data; + int cpu = perf_env__get_cpu(env, map, idx); + + return cpu == -1 ? -1 : env->cpu[cpu].socket_id; +} + +static int perf_env__get_core(struct cpu_map *map, int idx, void *data) +{ + struct perf_env *env = data; + int core = -1, cpu = perf_env__get_cpu(env, map, idx); + + if (cpu != -1) { + int socket_id = env->cpu[cpu].socket_id; + + /* + * Encode socket in upper 16 bits + * core_id is relative to socket, and + * we need a global id. So we combine + * socket + core id. 
+ */ + core = (socket_id << 16) | (env->cpu[cpu].core_id & 0xffff); + } + + return core; +} + +static int perf_env__build_socket_map(struct perf_env *env, struct cpu_map *cpus, + struct cpu_map **sockp) +{ + return cpu_map__build_map(cpus, sockp, perf_env__get_socket, env); +} + +static int perf_env__build_core_map(struct perf_env *env, struct cpu_map *cpus, + struct cpu_map **corep) +{ + return cpu_map__build_map(cpus, corep, perf_env__get_core, env); +} + +static int perf_stat__get_socket_file(struct cpu_map *map, int idx) +{ + return perf_env__get_socket(map, idx, &perf_stat.session->header.env); +} + +static int perf_stat__get_core_file(struct cpu_map *map, int idx) +{ + return perf_env__get_core(map, idx, &perf_stat.session->header.env); +} + +static int perf_stat_init_aggr_mode_file(struct perf_stat *st) +{ + struct perf_env *env = &st->session->header.env; + + switch (stat_config.aggr_mode) { + case AGGR_SOCKET: + if (perf_env__build_socket_map(env, evsel_list->cpus, &aggr_map)) { + perror("cannot build socket map"); + return -1; + } + aggr_get_id = perf_stat__get_socket_file; + break; + case AGGR_CORE: + if (perf_env__build_core_map(env, evsel_list->cpus, &aggr_map)) { + perror("cannot build core map"); + return -1; + } + aggr_get_id = perf_stat__get_core_file; + break; + case AGGR_NONE: + case AGGR_GLOBAL: + case AGGR_THREAD: + case AGGR_UNSET: + default: + break; + } + + return 0; +} + /* * Add default attributes, if there were no attributes specified or * if -d/--detailed, -d -d or -d -d -d is used: @@ -1538,7 +1633,15 @@ int process_stat_config_event(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_session *session __maybe_unused) { + struct perf_stat *st = container_of(tool, struct perf_stat, tool); + perf_event__read_stat_config(&stat_config, &event->stat_config); + + if (perf_stat.file.is_pipe) + perf_stat_init_aggr_mode(); + else + perf_stat_init_aggr_mode_file(st); + return 0; } -- cgit v1.1 From 6edb78a2178fd85d07b1a7fbb3629be56b860224 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:41:01 +0100 Subject: perf stat report: Move csv_sep initialization before report command So that csv_sep is properly initialized before the report command leg runs.
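An illustrative consequence (separator choice arbitrary, exact flag placement may vary): a CSV separator passed via -x is now in effect for the report leg as well:

    $ perf stat record -x, -e cycles usleep 1    # record, as before
    $ perf stat -x, report                       # csv_sep is already set when __cmd_report() runs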
Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-18-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c780525..f9d4e09 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1776,6 +1776,13 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) (const char **) stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); + if (csv_sep) { + csv_output = true; + if (!strcmp(csv_sep, "\\t")) + csv_sep = "\t"; + } else + csv_sep = DEFAULT_SEPARATOR; + if (argc && !strncmp(argv[0], "rec", 3)) { argc = __cmd_record(argc, argv); if (argc < 0) @@ -1826,13 +1833,6 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) stat_config.output = output; - if (csv_sep) { - csv_output = true; - if (!strcmp(csv_sep, "\\t")) - csv_sep = "\t"; - } else - csv_sep = DEFAULT_SEPARATOR; - /* * let the spreadsheet do the pretty-printing */ -- cgit v1.1 From a56f9390aa9d9b1c782c3dbd5ca2c4245eb265fc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:40:59 +0100 Subject: perf stat report: Process stat and stat round events Adding processing of stat and stat round events. The stat data comes in stat events and is stored under the perf_evsel object by the generic perf_event__process_stat_event function. The stat-round events come each interval, or as the last event in non-interval mode. The process_stat_round_event function processes the stored data for each perf_evsel object and prints it out. Committer note: After this patch: $ perf stat record usleep 1 Performance counter stats for 'usleep 1': 0.498381 task-clock (msec) # 0.571 CPUs utilized 2 context-switches # 0.004 M/sec 0 cpu-migrations # 0.000 K/sec 149 page-faults # 0.299 M/sec 1,271,635 cycles # 2.552 GHz 928,712 stalled-cycles-frontend # 73.03% frontend cycles idle 663,286 stalled-cycles-backend # 52.16% backend cycles idle 792,614 instructions # 0.62 insns per cycle # 1.17 stalled cycles per insn 136,850 branches # 274.589 M/sec branch-misses (0.00%) 0.000873419 seconds time elapsed $ $ perf stat report Performance counter stats for '/home/acme/bin/perf stat record usleep 1': 0.498381 task-clock (msec) # 0.571 CPUs utilized 2 context-switches # 0.004 M/sec 0 cpu-migrations # 0.000 K/sec 149 page-faults # 0.299 M/sec 1,271,635 cycles # 2.552 GHz 928,712 stalled-cycles-frontend # 73.03% frontend cycles idle 663,286 stalled-cycles-backend # 52.16% backend cycles idle 792,614 instructions # 0.62 insns per cycle # 1.17 stalled cycles per insn 136,850 branches # 274.589 M/sec branch-misses (0.00%) 0.000873419 seconds time elapsed $ Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-16-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f9d4e09..d27d1b9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1628,6 +1628,32 @@ static int __cmd_record(int argc, const char **argv) return argc; } +static int process_stat_round_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session) +{ + 
struct stat_round_event *round = &event->stat_round; + struct perf_evsel *counter; + struct timespec tsh, *ts = NULL; + const char **argv = session->header.env.cmdline_argv; + int argc = session->header.env.nr_cmdline; + + evlist__for_each(evsel_list, counter) + perf_stat_process_counter(&stat_config, counter); + + if (round->type == PERF_STAT_ROUND_TYPE__FINAL) + update_stats(&walltime_nsecs_stats, round->time); + + if (stat_config.interval && round->time) { + tsh.tv_sec = round->time / NSECS_PER_SEC; + tsh.tv_nsec = round->time % NSECS_PER_SEC; + ts = &tsh; + } + + print_counters(ts, argc, argv); + return 0; +} + static int process_stat_config_event(struct perf_tool *tool __maybe_unused, union perf_event *event, @@ -1713,6 +1739,8 @@ static struct perf_stat perf_stat = { .thread_map = process_thread_map_event, .cpu_map = process_cpu_map_event, .stat_config = process_stat_config_event, + .stat = perf_event__process_stat_event, + .stat_round = process_stat_round_event, }, }; -- cgit v1.1 From fa6ea7817db3839b58d46649b7834320257e7702 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:41:00 +0100 Subject: perf stat report: Process event update events Adding processing of event update events, so perf stat report can store additional info for events - unit, scale, name. Committer note: Before: # perf stat record -e power/energy-cores/ -a ^C Performance counter stats for 'system wide': 77.41 Joules power/energy-cores/ 1.597176695 seconds time elapsed # perf stat report Performance counter stats for '/home/acme/bin/perf stat record -e power/energy-cores/ -a': 332,488,114,176 power/energy-cores/ 1.597176695 seconds time elapsed # After, using the same perf.data file generated in the "Before" case above: # perf stat report Performance counter stats for '/home/acme/bin/perf stat record -e power/energy-cores/ -a': 77.41 Joules power/energy-cores/ 1.597176695 seconds time elapsed # Reported-by: Kan Liang Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-17-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d27d1b9..3ccf5a9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1736,6 +1736,7 @@ static const char * const report_usage[] = { static struct perf_stat perf_stat = { .tool = { .attr = perf_event__process_attr, + .event_update = perf_event__process_event_update, .thread_map = process_thread_map_event, .cpu_map = process_cpu_map_event, .stat_config = process_stat_config_event, -- cgit v1.1 From 89af4e05c21d68f22e07fe66940ea675615a49ed Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 5 Nov 2015 15:41:02 +0100 Subject: perf stat report: Allow to override aggr_mode Allow overriding the recorded aggr_mode. It's possible to use perf stat like: $ perf stat report -A $ perf stat report --per-core $ perf stat report --per-socket To customize the recorded aggregate mode regardless of what was used during the stat record command. Reported-by: Kan Liang Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1446734469-11352-19-git-send-email-jolsa@kernel.org [ Renamed 'stat' parameter to 'st' to fix 'already defined' build error with older distros (e.g. 
RHEL6.7) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 10 ++++++++++ tools/perf/builtin-stat.c | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 95f4928..52ef7a9 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -182,6 +182,16 @@ Reads and reports stat data from perf data file. --input file:: Input file name. +--per-socket:: +Aggregate counts per processor socket for system-wide mode measurements. + +--per-core:: +Aggregate counts per physical processor for system-wide mode measurements. + +-A:: +--no-aggr:: +Do not aggregate counts across all monitored CPUs. + EXAMPLES -------- diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3ccf5a9..9805e03 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -138,6 +138,7 @@ struct perf_stat { bool maps_allocated; struct cpu_map *cpus; struct thread_map *threads; + enum aggr_mode aggr_mode; }; static struct perf_stat perf_stat; @@ -1663,6 +1664,15 @@ int process_stat_config_event(struct perf_tool *tool __maybe_unused, perf_event__read_stat_config(&stat_config, &event->stat_config); + if (cpu_map__empty(st->cpus)) { + if (st->aggr_mode != AGGR_UNSET) + pr_warning("warning: processing task data, aggregation mode not set\n"); + return 0; + } + + if (st->aggr_mode != AGGR_UNSET) + stat_config.aggr_mode = st->aggr_mode; + if (perf_stat.file.is_pipe) perf_stat_init_aggr_mode(); else @@ -1743,6 +1753,7 @@ static struct perf_stat perf_stat = { .stat = perf_event__process_stat_event, .stat_round = process_stat_round_event, }, + .aggr_mode = AGGR_UNSET, }; static int __cmd_report(int argc, const char **argv) @@ -1750,6 +1761,12 @@ static int __cmd_report(int argc, const char **argv) struct perf_session *session; const struct option options[] = { OPT_STRING('i', "input", &input_name, "file", "input file name"), + OPT_SET_UINT(0, "per-socket", &perf_stat.aggr_mode, + "aggregate counts per processor socket", AGGR_SOCKET), + OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode, + "aggregate counts per physical processor core", AGGR_CORE), + OPT_SET_UINT('A', "no-aggr", &perf_stat.aggr_mode, + "disable CPU count aggregation", AGGR_NONE), OPT_END() }; struct stat st; -- cgit v1.1 From 1717f2096b543cede7a380c858c765c41936bc35 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:09 +0100 Subject: panic, x86: Fix re-entrance problem due to panic on NMI If panic on NMI happens just after panic() on the same CPU, panic() is recursively called. Kernel stalls, as a result, after failing to acquire panic_lock. To avoid this problem, don't call panic() in NMI context if we've already entered panic(). For that, introduce nmi_panic() macro to reduce code duplication. In the case of panic on NMI, don't return from NMI handlers if another CPU already panicked. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Aaron Tomlin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Chris Metcalf Cc: David Hildenbrand Cc: Don Zickus Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Gobinda Charan Maji Cc: HATAYAMA Daisuke Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Javi Merino Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: lkml Cc: Masami Hiramatsu Cc: Michal Nazarewicz Cc: Nicolas Iooss Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Rusty Russell Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vitaly Kuznetsov Cc: Vivek Goyal Link: http://lkml.kernel.org/r/20151210014626.25437.13302.stgit@softrs [ Cleanup comments, fixup formatting. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi.c | 16 ++++++++++++---- include/linux/kernel.h | 20 ++++++++++++++++++++ kernel/panic.c | 16 +++++++++++++--- kernel/watchdog.c | 2 +- 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 697f90d..fca8793 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + nmi_panic("NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason, smp_processor_id()); show_regs(regs); - if (panic_on_io_nmi) - panic("NMI IOCK error: Not continuing"); + if (panic_on_io_nmi) { + nmi_panic("NMI IOCK error: Not continuing"); + + /* + * If we end up here, it means we have received an NMI while + * processing panic(). Simply return without delaying and + * re-enabling NMIs. + */ + return; + } /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; @@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + nmi_panic("NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 350dfb0..750cc5c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -446,6 +446,26 @@ extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; /* + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It + * holds a CPU number which is executing panic() currently. A value of + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). + */ +extern atomic_t panic_cpu; +#define PANIC_CPU_INVALID -1 + +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. + */ +#define nmi_panic(fmt, ...) \ +do { \ + int cpu = raw_smp_processor_id(); \ + \ + if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \ + panic(fmt, ##__VA_ARGS__); \ +} while (0) + +/* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. */ diff --git a/kernel/panic.c b/kernel/panic.c index 4b150bc..3344524 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,8 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + /** * panic - halt the system * @fmt: The text string to print @@ -71,17 +73,17 @@ void __weak panic_smp_self_stop(void) */ void panic(const char *fmt, ...) 
{ - static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; int state = 0; + int old_cpu, this_cpu; /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs - * after the panic_lock is acquired) from invoking panic again. + * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); @@ -94,8 +96,16 @@ void panic(const char *fmt, ...) * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). + * + * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which + * comes here, so go ahead. + * `old_cpu == this_cpu' means we came from nmi_panic() which sets + * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - if (!spin_trylock(&panic_lock)) + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18f34cf..b9be18f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - panic("Hard LOCKUP"); + nmi_panic("Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return;
-- cgit v1.1

From 58c5661f2144c089bbc2e5d87c9ec1dc1d2964fe Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai
Date: Mon, 14 Dec 2015 11:19:10 +0100
Subject: panic, x86: Allow CPUs to save registers even if looping in NMI context

Currently, kdump_nmi_shootdown_cpus(), a subroutine of crash_kexec(), sends an NMI IPI to CPUs which haven't called panic() to stop them, save their register information and do some cleanups for crash dumping. However, if such a CPU is infinitely looping in NMI context, we fail to save its register information into the crash dump.

For example, this can happen when unknown NMIs are broadcast to all CPUs as follows:

  CPU 0                             CPU 1
  ===========================       ==========================
  receive an unknown NMI
  unknown_nmi_error()
    panic()                         receive an unknown NMI
      spin_trylock(&panic_lock)     unknown_nmi_error()
      crash_kexec()                   panic()
                                        spin_trylock(&panic_lock)
                                        panic_smp_self_stop()
                                        infinite loop
        kdump_nmi_shootdown_cpus()
          issue NMI IPI ----------->   blocked until IRET
                                        infinite loop...

Here, since CPU 1 is in NMI context, the second NMI from CPU 0 is blocked until CPU 1 executes IRET. However, CPU 1 never executes IRET, so the NMI is not handled and the callback function to save registers is never called. In practice, this can happen on some servers which broadcast NMIs to all CPUs when the NMI button is pushed.

To save registers in this case, we need to:

  a) Return from NMI handler instead of looping infinitely
  or
  b) Call the callback function directly from the infinite loop

Inherently, a) is risky because NMI is also used to prevent corrupted data from being propagated to devices. So, we chose b).

This patch does the following:

1. Move the infinite looping of CPUs which haven't called panic() in NMI context (actually done by panic_smp_self_stop()) outside of panic() to enable us to refer to pt_regs. Please note that panic_smp_self_stop() is still used for normal context.

2.
Call a callback of kdump_nmi_shootdown_cpus() directly to save registers and do some cleanups after setting waiting_for_crash_ipi which is used for counting down the number of CPUs which handled the callback Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Aaron Tomlin Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Chris Metcalf Cc: Dave Young Cc: David Hildenbrand Cc: Don Zickus Cc: Eric Biederman Cc: Frederic Weisbecker Cc: Gobinda Charan Maji Cc: HATAYAMA Daisuke Cc: Hidehiro Kawai Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Javi Merino Cc: Jiang Liu Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: lkml Cc: Masami Hiramatsu Cc: Michal Nazarewicz Cc: Nicolas Iooss Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Rasmus Villemoes Cc: Seth Jennings Cc: Stefan Lippers-Hollmann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vitaly Kuznetsov Cc: Vivek Goyal Cc: Yasuaki Ishimatsu Link: http://lkml.kernel.org/r/20151210014628.25437.75256.stgit@softrs [ Cleanup comments, fixup formatting. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kernel/nmi.c | 6 +++--- arch/x86/kernel/reboot.c | 20 ++++++++++++++++++++ include/linux/kernel.h | 16 ++++++++++++---- kernel/panic.c | 9 +++++++++ kernel/watchdog.c | 2 +- 5 files changed, 45 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index fca8793..424aec4 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) #endif if (panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); @@ -256,7 +256,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) show_regs(regs); if (panic_on_io_nmi) { - nmi_panic("NMI IOCK error: Not continuing"); + nmi_panic(regs, "NMI IOCK error: Not continuing"); /* * If we end up here, it means we have received an NMI while @@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) - nmi_panic("NMI: Not continuing"); + nmi_panic(regs, "NMI: Not continuing"); pr_emerg("Dazed and confused, but trying to continue\n"); } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 02693dd..1da1302 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -718,6 +718,7 @@ static int crashing_cpu; static nmi_shootdown_cb shootdown_callback; static atomic_t waiting_for_crash_ipi; +static int crash_ipi_issued; static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) { @@ -780,6 +781,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) smp_send_nmi_allbutself(); + /* Kick CPUs looping in NMI context. */ + WRITE_ONCE(crash_ipi_issued, 1); + msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { mdelay(1); @@ -788,6 +792,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } + +/* Override the weak function in kernel/panic.c */ +void nmi_panic_self_stop(struct pt_regs *regs) +{ + while (1) { + /* + * Wait for the crash dumping IPI to be issued, and then + * call its callback directly. 
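+	 *
+	 * [ Editor's note: the READ_ONCE() below pairs with the
+	 *   WRITE_ONCE(crash_ipi_issued, 1) in nmi_shootdown_cpus()
+	 *   above, so this loop is guaranteed to observe the flag once
+	 *   the crash IPI has been sent. ]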
+ */ + if (READ_ONCE(crash_ipi_issued)) + crash_nmi_callback(0, regs); /* Don't return */ + + cpu_relax(); + } +} + #else /* !CONFIG_SMP */ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 750cc5c..7311c32 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -255,6 +255,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; +void nmi_panic_self_stop(struct pt_regs *); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); @@ -455,14 +456,21 @@ extern atomic_t panic_cpu; /* * A variant of panic() called from NMI context. We return if we've already - * panicked on this CPU. + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. */ -#define nmi_panic(fmt, ...) \ +#define nmi_panic(regs, fmt, ...) \ do { \ - int cpu = raw_smp_processor_id(); \ + int old_cpu, cpu; \ \ - if (atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu) != cpu) \ + cpu = raw_smp_processor_id(); \ + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); \ + \ + if (old_cpu == PANIC_CPU_INVALID) \ panic(fmt, ##__VA_ARGS__); \ + else if (old_cpu != cpu) \ + nmi_panic_self_stop(regs); \ } while (0) /* diff --git a/kernel/panic.c b/kernel/panic.c index 3344524..06f31b49 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -61,6 +61,15 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourselves in NMI context if another CPU has already panicked. Arch code + * may override this to prepare for crash dumping, e.g. save regs info. + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b9be18f..84b5035 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -351,7 +351,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - nmi_panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return;
-- cgit v1.1

From 7bbee5ca3896f69f09c68be549cb8997abe6bca6 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai
Date: Mon, 14 Dec 2015 11:19:11 +0100
Subject: kexec: Fix race between panic() and crash_kexec()

Currently, panic() and crash_kexec() can be called at the same time. For example (x86 case):

  CPU 0:
    oops_end()
      crash_kexec()
        mutex_trylock() // acquired
          nmi_shootdown_cpus() // stop other CPUs

  CPU 1:
    panic()
      crash_kexec()
        mutex_trylock() // failed to acquire
      smp_send_stop() // stop other CPUs
      infinite loop

If CPU 1 calls smp_send_stop() before nmi_shootdown_cpus(), kdump fails.

In another case:

  CPU 0:
    oops_end()
      crash_kexec()
        mutex_trylock() // acquired
          <NMI>
            io_check_error()
              panic()
                crash_kexec()
                  mutex_trylock() // failed to acquire
                  infinite loop

Clearly, this is an undesirable result. To fix this problem, this patch changes crash_kexec() to exclude others by using the panic_cpu atomic.

Signed-off-by: Hidehiro Kawai
Acked-by: Michal Hocko
Cc: Andrew Morton
Cc: Baoquan He
Cc: Dave Young
Cc: "Eric W. Biederman"
Cc: HATAYAMA Daisuke
Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Martin Schwidefsky Cc: Masami Hiramatsu Cc: Minfei Huang Cc: Peter Zijlstra Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014630.25437.94161.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- include/linux/kexec.h | 2 ++ kernel/kexec_core.c | 30 +++++++++++++++++++++++++++++- kernel/panic.c | 8 ++++++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d140b1e..7b68d27 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -237,6 +237,7 @@ extern int kexec_purgatory_get_set_symbol(struct kimage *image, unsigned int size, bool get_value); extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +extern void __crash_kexec(struct pt_regs *); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); @@ -332,6 +333,7 @@ int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; +static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } #define kexec_in_progress false diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 11b64a6..c823f30 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -853,7 +853,12 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; -void crash_kexec(struct pt_regs *regs) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -876,6 +881,29 @@ void crash_kexec(struct pt_regs *regs) } } +void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + size_t crash_get_memory_size(void) { size_t size = 0; diff --git a/kernel/panic.c b/kernel/panic.c index 06f31b49..b333380 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -136,9 +136,11 @@ void panic(const char *fmt, ...) * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -161,9 +163,11 @@ void panic(const char *fmt, ...) * panic_notifiers and dumping kmsg before kdump. 
* Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); bust_spinlocks(0); -- cgit v1.1 From b7c4948e9881fb38b048269f376fb4bf194ce24a Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:12 +0100 Subject: x86/apic: Introduce apic_extnmi command line parameter This patch introduces a command line parameter apic_extnmi: apic_extnmi=( bsp|all|none ) The default value is "bsp" and this is the current behavior: only the Boot-Strapping Processor receives an external NMI. "all" allows external NMIs to be broadcast to all CPUs. This would raise the success rate of panic on NMI when BSP hangs in NMI context or the external NMI is swallowed by other NMI handlers on the BSP. If you specify "none", no CPUs receive external NMIs. This is useful for the dump capture kernel so that it cannot be shot down by accidentally pressing the external NMI button (on platforms which have it) while saving a crash dump. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Bandan Das Cc: Baoquan He Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiang Liu Cc: Joerg Roedel Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: "Maciej W. Rozycki" Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Ricardo Ribalda Delgado Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Viresh Kumar Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014632.25437.43778.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 9 +++++++++ arch/x86/include/asm/apic.h | 5 +++++ arch/x86/kernel/apic/apic.c | 34 ++++++++++++++++++++++++++++++++-- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 742f69d..74acea5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -472,6 +472,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Change the amount of debugging information output when initialising the APIC and IO-APIC components. + apic_extnmi= [APIC,X86] External NMI delivery setting + Format: { bsp (default) | all | none } + bsp: External NMI is delivered only to CPU 0 + all: External NMIs are broadcast to all CPUs as a + backup of CPU 0 + none: External NMI is masked for all CPUs. This is + useful so that a dump capture kernel won't be + shot down by NMI + autoconf= [IPV6] See Documentation/networking/ipv6.txt. 
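[ Editor's note: usage is a single boot parameter, e.g.

	apic_extnmi=all

  to broadcast external NMIs to all CPUs as a backup of CPU 0; "bsp" and
  "none" select the other two modes, and anything else is rejected by the
  apic_set_extnmi() early_param handler added below. ]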
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 7f62ad4..c80f6b6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -23,6 +23,11 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 +/* Macros for apic_extnmi which controls external NMI masking */ +#define APIC_EXTNMI_BSP 0 /* Default */ +#define APIC_EXTNMI_ALL 1 +#define APIC_EXTNMI_NONE 2 + /* * Define the default level of output to be very little * This can be turned up by using apic=verbose for more diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8d7df74..8a5cdda 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -82,6 +82,12 @@ physid_mask_t phys_cpu_present_map; static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; /* + * This variable controls which CPUs receive external NMIs. By default, + * external NMIs are delivered only to the BSP. + */ +static int apic_extnmi = APIC_EXTNMI_BSP; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -1161,6 +1167,8 @@ void __init init_bsp_APIC(void) value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; + if (apic_extnmi == APIC_EXTNMI_NONE) + value |= APIC_LVT_MASKED; apic_write(APIC_LVT1, value); } @@ -1378,9 +1386,11 @@ void setup_local_APIC(void) apic_write(APIC_LVT0, value); /* - * only the BP should see the LINT1 NMI signal, obviously. + * Only the BSP sees the LINT1 NMI signal by default. This can be + * modified by apic_extnmi= boot option. */ - if (!cpu) + if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) || + apic_extnmi == APIC_EXTNMI_ALL) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; @@ -2557,3 +2567,23 @@ static int __init apic_set_disabled_cpu_apicid(char *arg) return 0; } early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); + +static int __init apic_set_extnmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strncmp("all", arg, 3)) + apic_extnmi = APIC_EXTNMI_ALL; + else if (!strncmp("none", arg, 4)) + apic_extnmi = APIC_EXTNMI_NONE; + else if (!strncmp("bsp", arg, 3)) + apic_extnmi = APIC_EXTNMI_BSP; + else { + pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic_extnmi", apic_set_extnmi);
-- cgit v1.1

From b279d67df88a49c6ca32b3eebd195660254be394 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai
Date: Mon, 14 Dec 2015 11:19:13 +0100
Subject: x86/nmi: Save regs in crash dump on external NMI

Now, multiple CPUs can receive an external NMI simultaneously by specifying the "apic_extnmi=all" command line parameter. When we take a crash dump by using external NMI with this option, we fail to save registers into the crash dump. This happens as follows:

  CPU 0                              CPU 1
  ================================   =============================
  receive an external NMI
  default_do_nmi()                   receive an external NMI
    spin_lock(&nmi_reason_lock)      default_do_nmi()
    io_check_error()                   spin_lock(&nmi_reason_lock)
      panic()                            busy loop
        ...
        kdump_nmi_shootdown_cpus()
          issue NMI IPI ----------->     blocked until IRET
                                         busy loop...

Here, since CPU 1 is in NMI context, an additional NMI from CPU 0 remains unhandled until CPU 1 IRETs. However, CPU 1 will never execute IRET, so the NMI is not handled and the callback function to save registers is never called.
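In code terms, CPU 1 is stuck on the unconditional raw_spin_lock(&nmi_reason_lock) in default_do_nmi(). A condensed sketch of the change made by the nmi.c hunk below (editorial illustration, not the literal patch text):

	/* before: spins forever if the lock holder never releases the lock */
	raw_spin_lock(&nmi_reason_lock);

	/* after: poll for the crash IPI while waiting for the lock */
	while (!raw_spin_trylock(&nmi_reason_lock)) {
		run_crash_ipi_callback(regs);	/* no return once the IPI is out */
		cpu_relax();
	}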
To solve this issue, we check if the IPI for crash dumping was issued while waiting for nmi_reason_lock to be released, and if so, call its callback function directly. If the IPI is not issued (e.g. kdump is disabled), the actual behavior doesn't change. Signed-off-by: Hidehiro Kawai Acked-by: Michal Hocko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Dave Young Cc: "Eric W. Biederman" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiang Liu Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stefan Lippers-Hollmann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210065245.4587.39316.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/reboot.h | 1 + arch/x86/kernel/nmi.c | 16 ++++++++++++++-- arch/x86/kernel/reboot.c | 24 +++++++++++++++++------- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index a82c4f1..2cb1cc2 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,5 +25,6 @@ void __noreturn machine_real_restart(unsigned int type); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); +void run_crash_ipi_callback(struct pt_regs *regs); #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 424aec4..8a2cdd7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -29,6 +29,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -356,8 +357,19 @@ static void default_do_nmi(struct pt_regs *regs) return; } - /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ - raw_spin_lock(&nmi_reason_lock); + /* + * Non-CPU-specific NMI: NMI sources can be processed on any CPU. + * + * Another CPU may be processing panic routines while holding + * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping, + * and if so, call its callback directly. If there is no CPU preparing + * crash dump, we simply loop here. + */ + while (!raw_spin_trylock(&nmi_reason_lock)) { + run_crash_ipi_callback(regs); + cpu_relax(); + } + reason = x86_platform.get_nmi_reason(); if (reason & NMI_REASON_MASK) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1da1302..d64889a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -793,17 +793,23 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Leave the nmi callback set */ } +/* + * Check if the crash dumping IPI got issued and if so, call its callback + * directly. This function is used when we have already been in NMI handler. + * It doesn't return. + */ +void run_crash_ipi_callback(struct pt_regs *regs) +{ + if (crash_ipi_issued) + crash_nmi_callback(0, regs); +} + /* Override the weak function in kernel/panic.c */ void nmi_panic_self_stop(struct pt_regs *regs) { while (1) { - /* - * Wait for the crash dumping IPI to be issued, and then - * call its callback directly. - */ - if (READ_ONCE(crash_ipi_issued)) - crash_nmi_callback(0, regs); /* Don't return */ - + /* If no CPU is preparing crash dump, we simply loop here. 
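+	 * [ Editor's note: run_crash_ipi_callback() returns only while
+	 *   crash_ipi_issued is still zero; once the IPI is out it ends
+	 *   up in crash_nmi_callback() and does not come back. ]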
*/ + run_crash_ipi_callback(regs); cpu_relax(); } } @@ -813,4 +819,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) { /* No other CPUs to shoot down */ } + +void run_crash_ipi_callback(struct pt_regs *regs) +{ +} #endif -- cgit v1.1 From 9f318e3fcb1d4c48c26e8ca2ff2a459b82f36a23 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Mon, 14 Dec 2015 11:19:14 +0100 Subject: Documentation: Document kernel.panic_on_io_nmi sysctl kernel.panic_on_io_nmi sysctl was introduced by commit 5211a242d0cb ("x86: Add sysctl to allow panic on IOCK NMI error") but its documentation is missing. So, add it. Signed-off-by: Hidehiro Kawai Requested-by: Borislav Petkov Cc: Andrew Morton Cc: Baoquan He Cc: Chris Metcalf Cc: Don Zickus Cc: "Eric W. Biederman" Cc: Heinrich Schuchardt Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Jonathan Corbet Cc: kexec@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Manfred Spraul Cc: Masami Hiramatsu Cc: Michal Hocko Cc: Nicolas Iooss Cc: Peter Zijlstra Cc: Seth Jennings Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Ulrich Obergfell Cc: Vivek Goyal Cc: x86-ml Link: http://lkml.kernel.org/r/20151210014637.25437.71903.stgit@softrs Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- Documentation/sysctl/kernel.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index af70d15..73c6b1e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -551,6 +551,21 @@ the recommended setting is 60. ============================================================== +panic_on_io_nmi: + +Controls the kernel's behavior when a CPU receives an NMI caused by +an IO error. + +0: try to continue operation (default) + +1: panic immediately. The IO error triggered an NMI. This indicates a + serious system condition which could result in IO data corruption. + Rather than continuing, panicking might be a better choice. Some + servers issue this sort of NMI when the dump button is pushed, + and you can use this option to take a crash dump. + +============================================================== + panic_on_oops: Controls the kernel's behaviour when an oops or BUG is encountered. -- cgit v1.1 From 2ccd71f1b278d450a6f8c8c737c7fe237ca06dc6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Dec 2015 10:39:39 +0100 Subject: x86/cpufeature: Move some of the scattered feature bits to x86_capability Turn the CPUID leafs which are proper CPUID feature bit leafs into separate ->x86_capability words. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1449481182-27541-2-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeature.h | 54 +++++++++++++++++++++++---------------- arch/x86/kernel/cpu/common.c | 5 ++++ arch/x86/kernel/cpu/scattered.c | 20 --------------- 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e4f8010..13d78e0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #endif -#define NCAPINTS 14 /* N 32-bit words worth of info */ +#define NCAPINTS 16 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -181,22 +181,17 @@ /* * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7 + * CPUID levels like 0x6, 0xA etc, word 7. 
+ * + * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ -#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ -#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ -#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ -#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ /* Virtualization flags: Linux defined, word 8 */ @@ -205,16 +200,7 @@ #define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ + #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ @@ -258,6 +244,30 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ + /* * BUG word(s) */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c2b7522..c755173 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -618,6 +618,8 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[9] = ebx; + + c->x86_capability[14] = cpuid_eax(0x00000006); } /* Extended state features: level 0x0000000d */ @@ -679,6 +681,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + if (c->extended_cpuid_level >= 0x8000000a) + c->x86_capability[15] = cpuid_edx(0x8000000a); + init_scattered_cpuid_features(c); } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 608fb26..8cb57df 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,32 +31,12 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, - { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, - { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, - { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, - { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 }, - { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, - { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, - { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, - { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, - { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, - { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, - { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, - { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, - { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.1 From 39c06df4dc10a41de5fe706f4378ee5f09beba73 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Dec 2015 10:39:40 +0100 Subject: x86/cpufeature: Cleanup get_cpu_cap() Add an 
enum for the ->x86_capability array indices and cleanup get_cpu_cap() by killing some redundant local vars. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1449481182-27541-3-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeature.h | 20 +++++++++++++++++ arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/common.c | 47 ++++++++++++++++++--------------------- arch/x86/kernel/cpu/transmeta.c | 4 ++-- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 13d78e0..35401fe 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -288,6 +288,26 @@ #include #include +enum cpuid_leafs +{ + CPUID_1_EDX = 0, + CPUID_8000_0001_EDX, + CPUID_8086_0001_EDX, + CPUID_LNX_1, + CPUID_1_ECX, + CPUID_C000_0001_EDX, + CPUID_8000_0001_ECX, + CPUID_LNX_2, + CPUID_LNX_3, + CPUID_7_0_EBX, + CPUID_D_1_EAX, + CPUID_F_0_EDX, + CPUID_F_1_EDX, + CPUID_8000_0008_EBX, + CPUID_6_EAX, + CPUID_8000_000A_EDX, +}; + #ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index d8fba5c..ae20be6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -43,7 +43,7 @@ static void init_c3(struct cpuinfo_x86 *c) /* store Centaur Extended Feature Flags as * word 5 of the CPU capability bit array */ - c->x86_capability[5] = cpuid_edx(0xC0000001); + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c755173..e14d5bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -599,52 +599,47 @@ void cpu_detect(struct cpuinfo_x86 *c) void get_cpu_cap(struct cpuinfo_x86 *c) { - u32 tfms, xlvl; - u32 ebx; + u32 eax, ebx, ecx, edx; /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; + c->x86_capability[CPUID_1_ECX] = ecx; + c->x86_capability[CPUID_1_EDX] = edx; } /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { - u32 eax, ebx, ecx, edx; - cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[9] = ebx; + c->x86_capability[CPUID_7_0_EBX] = ebx; - c->x86_capability[14] = cpuid_eax(0x00000006); + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); } /* Extended state features: level 0x0000000d */ if (c->cpuid_level >= 0x0000000d) { - u32 eax, ebx, ecx, edx; - cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[10] = eax; + c->x86_capability[CPUID_D_1_EAX] = eax; } /* Additional Intel-defined flags: level 0x0000000F */ if (c->cpuid_level >= 0x0000000F) { - u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=0 */ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[11] = edx; + c->x86_capability[CPUID_F_0_EDX] = edx; + if (cpu_has(c, X86_FEATURE_CQM_LLC)) { /* will be overridden if occupancy monitoring exists */ c->x86_cache_max_rmid = ebx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[12] = edx; + 
c->x86_capability[CPUID_F_1_EDX] = edx; + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; @@ -656,22 +651,24 @@ void get_cpu_cap(struct cpuinfo_x86 *c) } /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; + eax = cpuid_eax(0x80000000); + c->extended_cpuid_level = eax; + + if ((eax & 0xffff0000) == 0x80000000) { + if (eax >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } } if (c->extended_cpuid_level >= 0x80000008) { - u32 eax = cpuid_eax(0x80000008); + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; - c->x86_capability[13] = cpuid_ebx(0x80000008); + c->x86_capability[CPUID_8000_0008_EBX] = ebx; } #ifdef CONFIG_X86_32 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) @@ -682,7 +679,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_power = cpuid_edx(0x80000007); if (c->extended_cpuid_level >= 0x8000000a) - c->x86_capability[15] = cpuid_edx(0x8000000a); + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 3fa0e5a..252da7a 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -12,7 +12,7 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) xlvl = cpuid_eax(0x80860000); if ((xlvl & 0xffff0000) == 0x80860000) { if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); + c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } } @@ -82,7 +82,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) /* Unhide possibly hidden capability flags */ rdmsr(0x80860004, cap_mask, uk); wrmsr(0x80860004, ~0, uk); - c->x86_capability[0] = cpuid_edx(0x00000001); + c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001); wrmsr(0x80860004, cap_mask, uk); /* All Transmeta CPUs have a constant TSC */ -- cgit v1.1 From 362f924b64ba0f4be2ee0cb697690c33d40be721 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Dec 2015 10:39:41 +0100 Subject: x86/cpufeature: Remove unused and seldomly used cpu_has_xx macros Those are stupid and code should use static_cpu_has_safe() or boot_cpu_has() instead. Kill the least used and unused ones. The remaining ones need more careful inspection before a conversion can happen. On the TODO. 
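[ Editor's note: the intended replacement pattern, sketched with calls taken from hunks in this very patch:

	/* init/slow paths: test the capability words directly */
	if (boot_cpu_has(X86_FEATURE_MTRR))
		mtrr_if = &generic_mtrr_ops;

	/* hot paths: static_cpu_has_safe() patches the branch at runtime */
	if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
		return 0;
]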
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1449481182-27541-4-git-send-email-bp@alien8.de Cc: David Sterba Cc: Herbert Xu Cc: Peter Zijlstra Cc: Matt Mackall Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Thomas Gleixner --- arch/x86/crypto/chacha20_glue.c | 2 +- arch/x86/crypto/crc32c-intel_glue.c | 2 +- arch/x86/include/asm/cmpxchg_32.h | 2 +- arch/x86/include/asm/cmpxchg_64.h | 2 +- arch/x86/include/asm/cpufeature.h | 37 ++++------------------------- arch/x86/include/asm/xor_32.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/common.c | 4 +++- arch/x86/kernel/cpu/intel.c | 3 ++- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++--- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/perf_event_amd.c | 4 ++-- arch/x86/kernel/cpu/perf_event_amd_uncore.c | 11 +++++---- arch/x86/kernel/fpu/init.c | 4 ++-- arch/x86/kernel/hw_breakpoint.c | 6 +++-- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/vm86_32.c | 4 +++- arch/x86/mm/setup_nx.c | 4 ++-- drivers/char/hw_random/via-rng.c | 5 ++-- drivers/crypto/padlock-aes.c | 2 +- drivers/crypto/padlock-sha.c | 2 +- drivers/iommu/intel_irq_remapping.c | 2 +- fs/btrfs/disk-io.c | 2 +- 24 files changed, 48 insertions(+), 68 deletions(-) diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 722bace..8baaff5 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -125,7 +125,7 @@ static struct crypto_alg alg = { static int __init chacha20_simd_mod_init(void) { - if (!cpu_has_ssse3) + if (!boot_cpu_has(X86_FEATURE_SSSE3)) return -ENODEV; #ifdef CONFIG_AS_AVX2 diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 81a595d..0e98716 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -257,7 +257,7 @@ static int __init crc32c_intel_mod_init(void) if (!x86_match_cpu(crc32c_cpu_id)) return -ENODEV; #ifdef CONFIG_X86_64 - if (cpu_has_pclmulqdq) { + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f7e1429..e4959d0 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -109,6 +109,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) #endif -#define system_has_cmpxchg_double() cpu_has_cx8 +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 1af9469..caa23a3 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -18,6 +18,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) cmpxchg_local((ptr), (o), (n)); \ }) -#define system_has_cmpxchg_double() cpu_has_cx16 +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 35401fe..144b042 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -385,58 +385,29 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_de boot_cpu_has(X86_FEATURE_DE) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) #define cpu_has_tsc 
boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) -#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) -#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR) -#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) -#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) -#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) -#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) -#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE) -#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN) -#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT) -#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN) -#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2) -#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN) -#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE) -#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN) -#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) -#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) -#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) -#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) -#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) -#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) -#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) -#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) -#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB) -#define cpu_has_perfctr_l2 boot_cpu_has(X86_FEATURE_PERFCTR_L2) -#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) -#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) -#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) -#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) -#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT) +/* + * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * fast paths and boot_cpu_has() otherwise! 
+ */ #if __GNUC__ >= 4 extern void warn_pre_alternatives(void); diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 5a08bc8..c54beb4 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -553,7 +553,7 @@ do { \ if (cpu_has_xmm) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ - } else if (cpu_has_mmx) { \ + } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ xor_speed(&xor_block_pII_mmx); \ xor_speed(&xor_block_p5_mmx); \ } else { \ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a8816b3..34c3ad6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -304,7 +304,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -922,7 +922,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) void set_dr_addr_mask(unsigned long mask, int dr) { - if (!cpu_has_bpext) + if (!boot_cpu_has(X86_FEATURE_BPEXT)) return; switch (dr) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e14d5bd..4d5279c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1445,7 +1445,9 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); - if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) + if (cpu_feature_enabled(X86_FEATURE_VME) || + cpu_has_tsc || + boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 209ac1e..565648b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -445,7 +445,8 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (cpu_has_ds) { + + if (boot_cpu_has(X86_FEATURE_DS)) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index e38d338..0b6c523 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -591,7 +591,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (cpu_has_topoext) + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &edx); else @@ -637,7 +637,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) void init_amd_cacheinfo(struct cpuinfo_x86 *c) { - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { num_cache_leaves = find_num_cache_leaves(c); } else if (c->extended_cpuid_level >= 0x80000006) { if (cpuid_edx(0x80000006) & 0xf000) @@ -809,7 +809,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3b533cf..c870af1 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -349,7 +349,7 @@ static void get_fixed_ranges(mtrr_type *frs) void 
mtrr_save_fixed_ranges(void *info) { - if (cpu_has_mtrr) + if (boot_cpu_has(X86_FEATURE_MTRR)) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index f891b47..5c3d149 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -682,7 +682,7 @@ void __init mtrr_bp_init(void) phys_addr = 32; - if (cpu_has_mtrr) { + if (boot_cpu_has(X86_FEATURE_MTRR)) { mtrr_if = &generic_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(36); size_and_mask = 0x00f00000; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 1cee5d2..3ea177c 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -160,7 +160,7 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel) if (offset) return offset; - if (!cpu_has_perfctr_core) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) offset = index; else offset = index << 1; @@ -652,7 +652,7 @@ static __initconst const struct x86_pmu amd_pmu = { static int __init amd_core_pmu_init(void) { - if (!cpu_has_perfctr_core) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; switch (boot_cpu_data.x86) { diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index cc6cedb..4974274 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -523,10 +523,10 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; - if (!cpu_has_topoext) + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; - if (cpu_has_perfctr_nb) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { ret = -ENOMEM; @@ -540,7 +540,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (cpu_has_perfctr_l2) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { amd_uncore_l2 = alloc_percpu(struct amd_uncore *); if (!amd_uncore_l2) { ret = -ENOMEM; @@ -583,10 +583,11 @@ fail_online: /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ amd_uncore_nb = amd_uncore_l2 = NULL; - if (cpu_has_perfctr_l2) + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) perf_pmu_unregister(&amd_l2_pmu); fail_l2: - if (cpu_has_perfctr_nb) + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); if (amd_uncore_l2) free_percpu(amd_uncore_l2); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index be39b5f..22abea0 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -12,7 +12,7 @@ */ static void fpu__init_cpu_ctx_switch(void) { - if (!cpu_has_eager_fpu) + if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) stts(); else clts(); @@ -287,7 +287,7 @@ static void __init fpu__init_system_ctx_switch(void) current_thread_info()->status = 0; /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) + if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; if (xfeatures_mask & XFEATURE_MASK_EAGER) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 50a3fad..2bcfb5f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -300,6 +300,10 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) return -EINVAL; + + if (!boot_cpu_has(X86_FEATURE_BPEXT)) + return -EOPNOTSUPP; + /* * It's impossible to use a range breakpoint 
to fake out * user vs kernel detection because bp_len - 1 can't @@ -307,8 +311,6 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - if (!cpu_has_bpext) - return -EOPNOTSUPP; info->mask = bp->attr.bp_len - 1; info->len = X86_BREAKPOINT_LEN_1; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f2281e9..24d57f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -304,7 +304,7 @@ do { \ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5246193..483231e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -357,8 +357,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (cpu_has_sep) + + if (static_cpu_has_safe(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; + load_sp0(tss, &tsk->thread); put_cpu(); diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 90555bf..92e2eac 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,7 +31,7 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (cpu_has_nx && !disable_nx) + if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) __supported_pte_mask |= _PAGE_NX; else __supported_pte_mask &= ~_PAGE_NX; @@ -39,7 +39,7 @@ void x86_configure_nx(void) void __init x86_report_nx(void) { - if (!cpu_has_nx) { + if (!boot_cpu_has(X86_FEATURE_NX)) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " "missing in CPU!\n"); } else { diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c index 0c98a9d..44ce806 100644 --- a/drivers/char/hw_random/via-rng.c +++ b/drivers/char/hw_random/via-rng.c @@ -140,7 +140,7 @@ static int via_rng_init(struct hwrng *rng) * RNG configuration like it used to be the case in this * register */ if ((c->x86 == 6) && (c->x86_model >= 0x0f)) { - if (!cpu_has_xstore_enabled) { + if (!boot_cpu_has(X86_FEATURE_XSTORE_EN)) { pr_err(PFX "can't enable hardware RNG " "if XSTORE is not enabled\n"); return -ENODEV; @@ -200,8 +200,9 @@ static int __init mod_init(void) { int err; - if (!cpu_has_xstore) + if (!boot_cpu_has(X86_FEATURE_XSTORE)) return -ENODEV; + pr_info("VIA RNG detected\n"); err = hwrng_register(&via_rng); if (err) { diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index da2d677..97a3646 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c @@ -515,7 +515,7 @@ static int __init padlock_init(void) if (!x86_match_cpu(padlock_cpu_id)) return -ENODEV; - if (!cpu_has_xcrypt_enabled) { + if (!boot_cpu_has(X86_FEATURE_XCRYPT_EN)) { printk(KERN_NOTICE PFX "VIA PadLock detected, but not enabled. 
Hmm, strange...\n"); return -ENODEV; } diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index 4e154c9..8c5f906 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -540,7 +540,7 @@ static int __init padlock_init(void) struct shash_alg *sha1; struct shash_alg *sha256; - if (!x86_match_cpu(padlock_sha_ids) || !cpu_has_phe_enabled) + if (!x86_match_cpu(padlock_sha_ids) || !boot_cpu_has(X86_FEATURE_PHE_EN)) return -ENODEV; /* Register the newly added algorithm module if on * diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 1fae188..c12ba45 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -753,7 +753,7 @@ static inline void set_irq_posting_cap(void) * should have X86_FEATURE_CX16 support, this has been confirmed * with Intel hardware guys. */ - if ( cpu_has_cx16 ) + if (boot_cpu_has(X86_FEATURE_CX16)) intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP; for_each_iommu(iommu, drhd) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 974be09..42a378a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -923,7 +923,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (cpu_has_xmm4_2) + if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) return 0; #endif return 1; -- cgit v1.1 From 6e1315fe82308cd29e7550eab967262e8bbc71a3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Dec 2015 10:39:42 +0100 Subject: x86/cpu: Provide a config option to disable static_cpu_has This brings .text savings of about ~1.6K when building a tinyconfig. It is off by default so nothing changes for the default. Kconfig help text from Josh. Signed-off-by: Borislav Petkov Reviewed-by: Josh Triplett Link: http://lkml.kernel.org/r/1449481182-27541-5-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 11 +++++++++++ arch/x86/include/asm/cpufeature.h | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index db3622f..a2abc2f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -349,6 +349,17 @@ config X86_FEATURE_NAMES If in doubt, say Y. +config X86_FAST_FEATURE_TESTS + bool "Fast CPU feature tests" if EMBEDDED + default y + ---help--- + Some fast-paths in the kernel depend on the capabilities of the CPU. + Say Y here for the kernel to patch in the appropriate code at runtime + based on the capabilities of the CPU. The infrastructure for patching + code at runtime takes up some additional space; space-constrained + embedded systems may wish to say N here to produce smaller, slightly + slower code. + config X86_X2APIC bool "Support x2apic" depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 144b042..43e1444 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -409,7 +409,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * fast paths and boot_cpu_has() otherwise! 
*/ -#if __GNUC__ >= 4 +#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) extern void warn_pre_alternatives(void); extern bool __static_cpu_has_safe(u16 bit); -- cgit v1.1 From 4baf7fe40790c8ffdab54edc8e5b7051cfce3968 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Dec 2015 10:24:28 +0100 Subject: x86/mm: Align macro defines Bring PAGE_{SHIFT,SIZE,MASK} to the same indentation level as the rest of the header. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1449480268-26583-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/page_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index cc071c6..7bd0099 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -5,9 +5,9 @@ #include /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_SHIFT 12 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) -- cgit v1.1 From c8f3e518d3444ee9200a4987421fcee60f768f11 Mon Sep 17 00:00:00 2001 From: Jake Oshins Date: Thu, 10 Dec 2015 17:52:59 +0000 Subject: x86/irq: Export functions to allow MSI domains in modules The Linux kernel already has the concept of IRQ domain, wherein a component can expose a set of IRQs which are managed by a particular interrupt controller chip or other subsystem. The PCI driver exposes the notion of an IRQ domain for Message-Signaled Interrupts (MSI) from PCI Express devices. This patch exposes the functions which are necessary for creating a MSI IRQ domain within a module. 
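A rough sketch of how a module might wire these exports into an MSI domain of its own; the my_* names are hypothetical, and a real domain would also have to supply an irq_chip:

	#include <linux/irqdomain.h>
	#include <linux/msi.h>
	#include <asm/msi.h>

	/* Reuse the x86 PCI-MSI callbacks exported by this patch. */
	static struct msi_domain_ops my_msi_ops = {
		.msi_prepare	= pci_msi_prepare,
		.set_desc	= pci_msi_set_desc,
	};

	static struct msi_domain_info my_msi_info = {
		/* A real msi_domain_info would also point .chip at an irq_chip. */
		.flags	= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
		.ops	= &my_msi_ops,
	};

	static struct irq_domain *my_create_msi_domain(struct fwnode_handle *fn)
	{
		/* Parent the new domain on the exported x86 vector domain. */
		return pci_msi_create_irq_domain(fn, &my_msi_info,
						 x86_vector_domain);
	}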
[ tglx: Split it into x86 and core irq parts ] Signed-off-by: Jake Oshins Cc: gregkh@linuxfoundation.org Cc: kys@microsoft.com Cc: devel@linuxdriverproject.org Cc: olaf@aepfle.de Cc: apw@canonical.com Cc: vkuznets@redhat.com Cc: haiyangz@microsoft.com Cc: marc.zyngier@arm.com Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1449769983-12948-4-git-send-email-jakeo@microsoft.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/msi.h | 6 ++++++ arch/x86/kernel/apic/msi.c | 8 +++++--- arch/x86/kernel/apic/vector.c | 2 ++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 93724cc..eb4b09b 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_MSI_H #define _ASM_X86_MSI_H #include +#include typedef struct irq_alloc_info msi_alloc_info_t; +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg); + +void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc); + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 5f1feb6..ade2532 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -96,8 +96,8 @@ static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, return arg->msi_hwirq; } -static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, - int nvec, msi_alloc_info_t *arg) +int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, + msi_alloc_info_t *arg) { struct pci_dev *pdev = to_pci_dev(dev); struct msi_desc *desc = first_pci_msi_entry(pdev); @@ -113,11 +113,13 @@ static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, return 0; } +EXPORT_SYMBOL_GPL(pci_msi_prepare); -static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); } +EXPORT_SYMBOL_GPL(pci_msi_set_desc); static struct msi_domain_ops pci_msi_domain_ops = { .get_hwirq = pci_msi_get_hwirq, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 861bc59..908cb37 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -29,6 +29,7 @@ struct apic_chip_data { }; struct irq_domain *x86_vector_domain; +EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); static cpumask_var_t vector_cpumask; static struct irq_chip lapic_controller; @@ -66,6 +67,7 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) return data ? &data->cfg : NULL; } +EXPORT_SYMBOL_GPL(irqd_cfg); struct irq_cfg *irq_cfg(unsigned int irq) { -- cgit v1.1 From fb75a4282d0d9a3c7c44d940582c2d226cf3acfb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Dec 2015 20:07:38 +0000 Subject: futex: Drop refcount if requeue_pi() acquired the rtmutex If the proxy lock in the requeue loop acquires the rtmutex for a waiter then it also acquired a refcount on the pi_state related to the futex, but the waiter side does not drop the reference count. Add the missing free_pi_state() call.
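The pattern at stake is the usual counted-reference handoff, sketched below with the generic kref API rather than the futex internals (obj, hand_over() and waiter_done() are made-up names for illustration):

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct obj {
		struct kref ref;
	};

	static void obj_release(struct kref *ref)
	{
		kfree(container_of(ref, struct obj, ref));
	}

	/* Requeue side: hand the waiter a counted reference. */
	static void hand_over(struct obj *o)
	{
		kref_get(&o->ref);
	}

	/* Waiter side: must drop that reference exactly once. */
	static void waiter_done(struct obj *o)
	{
		kref_put(&o->ref, obj_release);
	}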
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Bhuvanesh_Surachari@mentor.com Cc: Andy Lowe Link: http://lkml.kernel.org/r/20151219200607.178132067@linutronix.de Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/futex.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index 684d754..24fbc77 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2755,6 +2755,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + free_pi_state(q.pi_state); spin_unlock(q.lock_ptr); } } else { -- cgit v1.1 From 29e9ee5d48c35d6cf8afe09bdf03f77125c9ac11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Dec 2015 20:07:39 +0000 Subject: futex: Rename free_pi_state() to put_pi_state() free_pi_state() is confusing as it is in fact only freeing/caching the pi state when the last reference is gone. Rename it to put_pi_state() which better reflects what it is doing. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Bhuvanesh_Surachari@mentor.com Cc: Andy Lowe Link: http://lkml.kernel.org/r/20151219200607.259636467@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 24fbc77..f1581ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -725,9 +725,12 @@ static struct futex_pi_state * alloc_pi_state(void) } /* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + * * Must be called with the hb lock held. */ -static void free_pi_state(struct futex_pi_state *pi_state) +static void put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) return; @@ -1729,7 +1732,7 @@ retry_private: case 0: break; case -EFAULT: - free_pi_state(pi_state); + put_pi_state(pi_state); pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1746,7 +1749,7 @@ retry_private: * exit to complete. * - The user space value changed. */ - free_pi_state(pi_state); + put_pi_state(pi_state); pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1815,7 +1818,7 @@ retry_private: } else if (ret) { /* -EDEADLK */ this->pi_state = NULL; - free_pi_state(pi_state); + put_pi_state(pi_state); goto out_unlock; } } @@ -1824,7 +1827,7 @@ retry_private: } out_unlock: - free_pi_state(pi_state); + put_pi_state(pi_state); double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); @@ -1973,7 +1976,7 @@ static void unqueue_me_pi(struct futex_q *q) __unqueue_futex(q); BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); + put_pi_state(q->pi_state); q->pi_state = NULL; spin_unlock(q->lock_ptr); @@ -2759,7 +2762,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ - free_pi_state(q.pi_state); + put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); } } else { -- cgit v1.1 From ecb38b78f698a51988ec456751b20440e54702fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Dec 2015 20:07:39 +0000 Subject: futex: Document pi_state refcounting in requeue code Documentation of the pi_state refcounting in the requeue code is non-existent. Add it.
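In miniature, the convention the new name follows: put_*() drops a reference and disposes of the object only on the final drop. The sketch mirrors the shape of put_pi_state() above, with a plain kfree() standing in for the real free-or-cache step:

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct thing {
		atomic_t refcount;
	};

	static void put_thing(struct thing *t)
	{
		if (!t)
			return;
		if (!atomic_dec_and_test(&t->refcount))
			return;		/* other references remain */
		kfree(t);		/* or stash it on a cache list */
	}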
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Bhuvanesh_Surachari@mentor.com Cc: Andy Lowe Link: http://lkml.kernel.org/r/20151219200607.335938312@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index f1581ff..20c4683 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1709,27 +1709,31 @@ retry_private: * exist yet, look it up one more time to ensure we have a * reference to it. If the lock was taken, ret contains the * vpid of the top waiter task. + * If the lock was not taken, we have pi_state and an initial + * refcount on it. In case of an error we have nothing. */ if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; /* - * If we acquired the lock, then the user - * space value of uaddr2 should be vpid. It - * cannot be changed by the top waiter as it - * is blocked on hb2 lock if it tries to do - * so. If something fiddled with it behind our - * back the pi state lookup might unearth - * it. So we rather use the known value than - * rereading and handing potential crap to - * lookup_pi_state. + * If we acquired the lock, then the user space value + * of uaddr2 should be vpid. It cannot be changed by + * the top waiter as it is blocked on hb2 lock if it + * tries to do so. If something fiddled with it behind + * our back the pi state lookup might unearth it. So + * we rather use the known value than rereading and + * handing potential crap to lookup_pi_state. + * + * If that call succeeds then we have pi_state and an + * initial refcount on it. */ ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { case 0: + /* We hold a reference on the pi state. */ break; case -EFAULT: put_pi_state(pi_state); @@ -1804,19 +1808,37 @@ retry_private: * of requeue_pi if we couldn't acquire the lock atomically. */ if (requeue_pi) { - /* Prepare the waiter to take the rt_mutex. */ + /* + * Prepare the waiter to take the rt_mutex. Take a + * refcount on the pi_state and store the pointer in + * the futex_q object of the waiter. + */ atomic_inc(&pi_state->refcount); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); if (ret == 1) { - /* We got the lock. */ + /* + * We got the lock. We do neither drop the + * refcount on pi_state nor clear + * this->pi_state because the waiter needs the + * pi_state for cleaning up the user space + * value. It will drop the refcount after + * doing so. + */ requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; } else if (ret) { - /* -EDEADLK */ + /* + * rt_mutex_start_proxy_lock() detected a + * potential deadlock when we tried to queue + * that waiter. Drop the pi_state reference + * which we took above and remove the pointer + * to the state from the waiters futex_q + * object. + */ this->pi_state = NULL; put_pi_state(pi_state); goto out_unlock; @@ -1827,6 +1849,11 @@ retry_private: } out_unlock: + /* + * We took an extra initial reference to the pi_state either + * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We + * need to drop it here again. 
+ */ put_pi_state(pi_state); double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); -- cgit v1.1 From 4959f2de11ca532a120a337429e5576fd283700f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Dec 2015 20:07:40 +0000 Subject: futex: Remove pointless put_pi_state calls in requeue() In the error handling cases we neither have pi_state nor a reference to it. Remove the pointless code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Bhuvanesh_Surachari@mentor.com Cc: Andy Lowe Link: http://lkml.kernel.org/r/20151219200607.432780944@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 20c4683..dcec018 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1735,9 +1735,9 @@ retry_private: case 0: /* We hold a reference on the pi state. */ break; + + /* If the above failed, then pi_state is NULL */ case -EFAULT: - put_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1753,8 +1753,6 @@ retry_private: * exit to complete. * - The user space value changed. */ - put_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); -- cgit v1.1 From 885c2cb770b5ac2507c41bc9f91a5d1c98337bee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Dec 2015 20:07:41 +0000 Subject: futex: Cleanup the goto confusion in requeue_pi() out_unlock: does not only drop the locks, it also drops the refcount on the pi_state. Really intuitive. Move the label after the put_pi_state() call and use 'break' in the error handling path of the requeue loop. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Bhuvanesh_Surachari@mentor.com Cc: Andy Lowe Link: http://lkml.kernel.org/r/20151219200607.526665141@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index dcec018..461d438 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1839,20 +1839,25 @@ retry_private: */ this->pi_state = NULL; put_pi_state(pi_state); - goto out_unlock; + /* + * We stop queueing more waiters and let user + * space deal with the mess. + */ + break; } } requeue_futex(this, hb1, hb2, &key2); drop_count++; } -out_unlock: /* * We took an extra initial reference to the pi_state either * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We * need to drop it here again. */ put_pi_state(pi_state); + +out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); -- cgit v1.1 From 337f13046ff03717a9e99675284a817527440a49 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 18 Dec 2015 13:36:37 -0800 Subject: futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op While reviewing Michael Kerrisk's recent futex manpage update, I noticed that we allow the FUTEX_CLOCK_REALTIME flag for FUTEX_WAIT_BITSET but not for FUTEX_WAIT. FUTEX_WAIT is treated as a simple version of FUTEX_WAIT_BITSET internally (with a bitmask of FUTEX_BITSET_MATCH_ANY). As such, I cannot come up with a reason for this exclusion for FUTEX_WAIT. This change does modify the behavior of the futex syscall, changing a call with FUTEX_WAIT | FUTEX_CLOCK_REALTIME from returning -ENOSYS to being equivalent to FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME with a bitset of FUTEX_BITSET_MATCH_ANY.
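From user space the newly permitted combination looks roughly like this; on kernels without this change the call fails with ENOSYS:

	#include <linux/futex.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <time.h>

	static long futex_wait_realtime(int *uaddr, int expected,
					const struct timespec *rel_timeout)
	{
		/* Relative timeout, now measured against CLOCK_REALTIME. */
		return syscall(SYS_futex, uaddr,
			       FUTEX_WAIT | FUTEX_CLOCK_REALTIME,
			       expected, rel_timeout, NULL, 0);
	}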
Reported-by: Michael Kerrisk Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Davidlohr Bueso Link: http://lkml.kernel.org/r/9f3bdc116d79d23f5ee72ceb9a2a857f5ff8fa29.1450474525.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index 461d438..8a310e2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3084,7 +3084,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ + cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } -- cgit v1.1 From b92c453d520ebf0703f8195e9f2c6e7522b85e1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Dec 2015 12:35:21 +0100 Subject: Revert "x86/kvm: On KVM re-enable (e.g. after suspend), update clocks" This reverts commit 677a73a9aa54. This patch was not meant to be merged and has issues. Revert it. Requested-by: Andy Lutomirski Cc: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/kvm/x86.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e32e87..00462bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,6 +123,8 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1669,6 +1671,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -7363,22 +7366,88 @@ int kvm_arch_hardware_enable(void) struct kvm_vcpu *vcpu; int i; int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); ret = kvm_x86_ops->hardware_enable(); if (ret != 0) return ret; + local_tsc = rdtsc(); + stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu == smp_processor_id()) { + if (!stable && vcpu->cpu == smp_processor_id()) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, - vcpu); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; } } } + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. 
Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * lose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader.
+ */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } return 0; } -- cgit v1.1 From 0105c8d8334fc941e0297ca6708fa57854114c0e Mon Sep 17 00:00:00 2001 From: "chengang@emindsoft.com.cn" Date: Sat, 26 Dec 2015 21:49:58 +0800 Subject: arch/x86/kernel/ptrace.c: Remove unused arg_offs_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The related warning from gcc 6.0: arch/x86/kernel/ptrace.c:127:18: warning: ‘arg_offs_table’ defined but not used [-Wunused-const-variable] static const int arg_offs_table[] = { ^~~~~~~~~~~~~~ Signed-off-by: Chen Gang Link: http://lkml.kernel.org/r/1451137798-28701-1-git-send-email-chengang@emindsoft.com.cn Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ptrace.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 558f50e..32e9d9c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -124,21 +124,6 @@ const char *regs_query_register_name(unsigned int offset) return NULL; } -static const int arg_offs_table[] = { -#ifdef CONFIG_X86_32 - [0] = offsetof(struct pt_regs, ax), - [1] = offsetof(struct pt_regs, dx), - [2] = offsetof(struct pt_regs, cx) -#else /* CONFIG_X86_64 */ - [0] = offsetof(struct pt_regs, di), - [1] = offsetof(struct pt_regs, si), - [2] = offsetof(struct pt_regs, dx), - [3] = offsetof(struct pt_regs, cx), - [4] = offsetof(struct pt_regs, r8), - [5] = offsetof(struct pt_regs, r9) -#endif -}; - /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- cgit v1.1 From 0d430e3fb3f7cdc13c0d22078b820f682821b45a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 22 Dec 2015 08:42:44 -0700 Subject: x86/LDT: Print the real LDT base address This was meant to print base address and entry count; make it do so again. Fixes: 37868fe113ff "x86/ldt: Make modify_ldt synchronous" Signed-off-by: Jan Beulich Acked-by: Andy Lutomirski Link: http://lkml.kernel.org/r/56797D8402000078000C24F0@prv-mh.provo.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e835d26..b9d99e0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -125,7 +125,7 @@ void release_thread(struct task_struct *dead_task) if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, - dead_task->mm->context.ldt, + dead_task->mm->context.ldt->entries, dead_task->mm->context.ldt->size); BUG(); } -- cgit v1.1 From 9abb0ecdee69a2577560cc283368e490da974934 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 21 Dec 2015 12:01:14 -0800 Subject: x86/mm: Drop WARN from multi-BAR check ioremapping multiple BARs produces a warning with a message "Your kernel is fine". This message mostly serves to comfort kernel developers. Users do not read the message, they only see the big scary warning which means something must be horribly broken with their system. Less dramatically, the warn also sets the taint flag which makes it difficult to differentiate problems. If the kernel is actually fine as the warning claims it doesn't make sense for it to be tainted. Change the WARN_ONCE to a pr_warn with the caller of the ioremap. 
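The replacement leans on the printk %pS format extension, which resolves a text address to symbol+offset, so the one-line message still identifies the offender without the backtrace and taint of a WARN. A minimal sketch of the pattern (the real code passes the caller address down from the ioremap wrappers rather than taking it locally):

	#include <linux/printk.h>

	static void report_multi_bar_mapping(void)
	{
		/* %pS prints the address as symbol+offset */
		pr_warn("caller %pS mapping multiple BARs\n",
			__builtin_return_address(0));
	}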
Signed-off-by: Laura Abbott Link: http://lkml.kernel.org/r/1450728074-31029-1-git-send-email-labbott@fedoraproject.org Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9c78f3..0d8d53d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -194,8 +194,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size)) + pr_warn("caller %pS mapping multiple BARs\n", caller); return ret_addr; err_free_area: -- cgit v1.1 From cd3417c8fc9504cc1afe944515f338aff9ec286b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Dec 2015 16:03:53 -0500 Subject: kill free_page_put_link() all callers are better off with kfree_put_link() Signed-off-by: Al Viro --- fs/configfs/symlink.c | 12 ++++++------ fs/fuse/dir.c | 6 +++--- fs/kernfs/symlink.c | 12 ++++++------ fs/libfs.c | 6 ------ include/linux/fs.h | 1 - 5 files changed, 15 insertions(+), 22 deletions(-) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index b91c01e..e9de962 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -282,29 +282,29 @@ static int configfs_getlink(struct dentry *dentry, char * path) static const char *configfs_get_link(struct dentry *dentry, struct inode *inode, void **cookie) { - unsigned long page; + char *page; int error; if (!dentry) return ERR_PTR(-ECHILD); - page = get_zeroed_page(GFP_KERNEL); + page = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!page) return ERR_PTR(-ENOMEM); - error = configfs_getlink(dentry, (char *)page); + error = configfs_getlink(dentry, page); if (!error) { - return *cookie = (void *)page; + return *cookie = page; } - free_page(page); + kfree(page); return ERR_PTR(error); } const struct inode_operations configfs_symlink_inode_operations = { .get_link = configfs_get_link, .readlink = generic_readlink, - .put_link = free_page_put_link, + .put_link = kfree_put_link, .setattr = configfs_setattr, }; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 148e8ef..def0a4d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1376,7 +1376,7 @@ static const char *fuse_get_link(struct dentry *dentry, if (!dentry) return ERR_PTR(-ECHILD); - link = (char *) __get_free_page(GFP_KERNEL); + link = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); @@ -1388,7 +1388,7 @@ static const char *fuse_get_link(struct dentry *dentry, args.out.args[0].value = link; ret = fuse_simple_request(fc, &args); if (ret < 0) { - free_page((unsigned long) link); + kfree(link); link = ERR_PTR(ret); } else { link[ret] = '\0'; @@ -1913,7 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, - .put_link = free_page_put_link, + .put_link = kfree_put_link, .readlink = generic_readlink, .getattr = fuse_getattr, .setxattr = fuse_setxattr, diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index ffae857..f9efdae 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -116,19 +116,19 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, struct inode *inode, void **cookie) { int error = -ENOMEM; - unsigned long page; + char *page; 
if (!dentry) return ERR_PTR(-ECHILD); - page = get_zeroed_page(GFP_KERNEL); + page = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!page) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, (char *)page); + error = kernfs_getlink(dentry, page); if (unlikely(error < 0)) { - free_page((unsigned long)page); + kfree(page); return ERR_PTR(error); } - return *cookie = (char *)page; + return *cookie = page; } const struct inode_operations kernfs_symlink_iops = { @@ -138,7 +138,7 @@ const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, .get_link = kernfs_iop_get_link, - .put_link = free_page_put_link, + .put_link = kfree_put_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, diff --git a/fs/libfs.c b/fs/libfs.c index 8dc37fc..fec7ab0 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1025,12 +1025,6 @@ void kfree_put_link(struct inode *unused, void *cookie) } EXPORT_SYMBOL(kfree_put_link); -void free_page_put_link(struct inode *unused, void *cookie) -{ - free_page((unsigned long) cookie); -} -EXPORT_SYMBOL(free_page_put_link); - /* * nop .set_page_dirty method so that people can use .page_mkwrite on * anon inodes. diff --git a/include/linux/fs.h b/include/linux/fs.h index d2fdf09..138e206 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2743,7 +2743,6 @@ extern int __page_symlink(struct inode *inode, const char *symname, int len, extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_put_link(struct inode *, void *); -extern void free_page_put_link(struct inode *, void *); extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); int vfs_getattr_nosec(struct path *path, struct kstat *stat); -- cgit v1.1 From 984cf355aeaa8f2eda3861b50d0e8d3e3f77e83b Mon Sep 17 00:00:00 2001 From: Ani Sinha Date: Thu, 17 Dec 2015 17:15:10 -0800 Subject: sysrq: Fix warning in sysrq generated crash. Commit 984d74a72076a1 ("sysrq: rcu-ify __handle_sysrq") replaced spin_lock_irqsave() calls with rcu_read_lock() calls in sysrq. Since rcu_read_lock() does not disable preemption, faulthandler_disabled() in __do_page_fault() in x86/fault.c returns false. When the code later calls might_sleep() in the pagefault handler, we get the following warning: BUG: sleeping function called from invalid context at ../arch/x86/mm/fault.c:1187 in_atomic(): 0, irqs_disabled(): 0, pid: 4706, name: bash Preemption disabled at:[] printk+0x48/0x4a To fix this, we release the RCU read lock before we crash. Tested this patch on linux 3.18 by booting off one of our boards. Fixes: 984d74a72076a1 ("sysrq: rcu-ify __handle_sysrq") Signed-off-by: Ani Sinha Reviewed-by: Rik van Riel Signed-off-by: Paul E. McKenney --- drivers/tty/sysrq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 5381a72..e513940 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -133,6 +133,12 @@ static void sysrq_handle_crash(int key) { char *killer = NULL; + /* we need to release the RCU read lock here, + * otherwise we get an annoying + * 'BUG: sleeping function called from invalid context' + * complaint from the kernel before the panic. 
+ */ + rcu_read_unlock(); panic_on_oops = 1; /* force panic */ wmb(); *killer = 1; -- cgit v1.1 From fceef393a538134f03b778c5d2519e670269342f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Dec 2015 15:58:39 -0500 Subject: switch ->get_link() to delayed_call, kill ->put_link() Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 - Documentation/filesystems/porting | 6 +++ Documentation/filesystems/vfs.txt | 21 +++++---- drivers/staging/lustre/lustre/llite/symlink.c | 18 ++++---- fs/9p/vfs_inode.c | 9 ++-- fs/9p/vfs_inode_dotl.c | 9 ++-- fs/affs/symlink.c | 1 - fs/autofs4/symlink.c | 3 +- fs/btrfs/inode.c | 1 - fs/cifs/cifsfs.c | 1 - fs/cifs/cifsfs.h | 3 +- fs/cifs/link.c | 6 ++- fs/coda/cnode.c | 1 - fs/configfs/symlink.c | 17 ++++---- fs/ecryptfs/inode.c | 7 +-- fs/ext2/symlink.c | 1 - fs/ext4/symlink.c | 8 ++-- fs/f2fs/namei.c | 16 ++++--- fs/fuse/dir.c | 6 +-- fs/gfs2/inode.c | 8 ++-- fs/hostfs/hostfs_kern.c | 16 +++---- fs/jfs/symlink.c | 1 - fs/kernfs/symlink.c | 19 ++++---- fs/libfs.c | 9 ++-- fs/minix/inode.c | 1 - fs/namei.c | 63 +++++++++++---------------- fs/ncpfs/inode.c | 1 - fs/nfs/symlink.c | 6 +-- fs/nilfs2/namei.c | 1 - fs/ocfs2/symlink.c | 1 - fs/overlayfs/inode.c | 45 ++----------------- fs/proc/base.c | 8 ++-- fs/proc/inode.c | 16 +++---- fs/proc/namespaces.c | 3 +- fs/proc/self.c | 7 +-- fs/proc/thread_self.c | 7 +-- fs/reiserfs/namei.c | 1 - fs/squashfs/symlink.c | 1 - fs/sysv/inode.c | 1 - fs/xfs/xfs_iops.c | 6 +-- include/linux/delayed_call.h | 34 +++++++++++++++ include/linux/fs.h | 14 +++--- mm/shmem.c | 19 ++++---- 43 files changed, 206 insertions(+), 218 deletions(-) create mode 100644 include/linux/delayed_call.h diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4fba54b..619af9b 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -51,7 +51,6 @@ prototypes: struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, void **); - void (*put_link) (struct inode *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); int (*get_acl)(struct inode *, int); @@ -84,7 +83,6 @@ rename: yes (all) (see below) rename2: yes (all) (see below) readlink: no get_link: no -put_link: no setattr: yes permission: no (may not block if called in rcu-walk mode) get_acl: no diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index cf92a8c..0f88e60 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -515,3 +515,9 @@ in your dentry operations instead. * ->get_link() gets inode as a separate argument * ->get_link() may be called in RCU mode - in that case NULL dentry is passed +-- +[mandatory] + ->get_link() gets struct delayed_call *done now, and should do + set_delayed_call() where it used to set *cookie. + ->put_link() is gone - just give the destructor to set_delayed_call() + in ->get_link(). 
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 8c6f07a..b02a7d5 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -350,8 +350,8 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); - const char *(*follow_link) (struct dentry *, void **); - void (*put_link) (struct inode *, void *); + const char *(*get_link) (struct dentry *, struct inode *, + struct delayed_call *); int (*permission) (struct inode *, int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); @@ -434,20 +434,19 @@ otherwise noted. readlink: called by the readlink(2) system call. Only required if you want to support reading symbolic links - follow_link: called by the VFS to follow a symbolic link to the + get_link: called by the VFS to follow a symbolic link to the inode it points to. Only required if you want to support symbolic links. This method returns the symlink body to traverse (and possibly resets the current position with nd_jump_link()). If the body won't go away until the inode is gone, nothing else is needed; if it needs to be otherwise - pinned, the data needed to release whatever we'd grabbed - is to be stored in void * variable passed by address to - follow_link() instance. - - put_link: called by the VFS to release resources allocated by - follow_link(). The cookie stored by follow_link() is passed - to this method as the last parameter; only called when - cookie isn't NULL. + pinned, arrange for its release by having get_link(..., ..., done) + do set_delayed_call(done, destructor, argument). + In that case destructor(argument) will be called once VFS is + done with the body you've returned. + May be called in RCU mode; that is indicated by NULL dentry + argument. If request can't be handled without leaving RCU mode, + have it return ERR_PTR(-ECHILD). permission: called by the VFS to check for access rights on a POSIX-like filesystem. diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index 153fdf9..e489a32 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -118,8 +118,14 @@ failed: return rc; } +static void ll_put_link(void *p) +{ + ptlrpc_req_finished(p); +} + static const char *ll_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct ptlrpc_request *request = NULL; int rc; @@ -137,22 +143,16 @@ static const char *ll_get_link(struct dentry *dentry, } /* symname may contain a pointer to the request message buffer, - * we delay request releasing until ll_put_link then. + * we delay request releasing then. 
*/ - *cookie = request; + set_delayed_call(done, ll_put_link, request); return symname; } -static void ll_put_link(struct inode *unused, void *cookie) -{ - ptlrpc_req_finished(cookie); -} - struct inode_operations ll_fast_symlink_inode_operations = { .readlink = generic_readlink, .setattr = ll_setattr, .get_link = ll_get_link, - .put_link = ll_put_link, .getattr = ll_getattr, .permission = ll_inode_permission, .setxattr = ll_setxattr, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8ba5a89..f928f87 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1226,11 +1226,12 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) * v9fs_vfs_get_link - follow a symlink path * @dentry: dentry for symlink * @inode: inode for symlink - * @cookie: place to pass the data to put_link() + * @done: delayed call for when we are done with the return value */ static const char *v9fs_vfs_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct v9fs_session_info *v9ses; struct p9_fid *fid; @@ -1266,7 +1267,8 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, p9stat_free(st); kfree(st); - return *cookie = res; + set_delayed_call(done, kfree_link, res); + return res; } /** @@ -1460,7 +1462,6 @@ static const struct inode_operations v9fs_file_inode_operations = { static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = v9fs_vfs_get_link, - .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0cc105d..a34702c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -902,12 +902,13 @@ error: * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink * @inode: inode for symlink - * @cookie: place to pass the data to put_link() + * @done: destructor for return value */ static const char * v9fs_vfs_get_link_dotl(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct p9_fid *fid; char *target; @@ -924,7 +925,8 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, retval = p9_client_readlink(fid, &target); if (retval) return ERR_PTR(retval); - return *cookie = target; + set_delayed_call(done, kfree_link, target); + return target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) @@ -991,7 +993,6 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, .get_link = v9fs_vfs_get_link_dotl, - .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .setxattr = generic_setxattr, diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 39d1194..69b03db 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -72,6 +72,5 @@ const struct address_space_operations affs_symlink_aops = { const struct inode_operations affs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = affs_notify_change, }; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 39e6f0b..84e037d 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -13,7 +13,8 @@ #include "autofs_i.h" static const char *autofs4_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct autofs_sb_info *sbi; struct 
autofs_info *ino; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d4aa69..1a41a65 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10097,7 +10097,6 @@ static const struct inode_operations btrfs_special_inode_operations = { static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4593f416..90e4e2b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -901,7 +901,6 @@ const struct inode_operations cifs_file_inode_ops = { const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, .get_link = cifs_get_link, - .put_link = kfree_put_link, .permission = cifs_permission, /* BB add the following two eventually */ /* revalidate: cifs_revalidate, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6886328..26a1187 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,7 +120,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern const char *cifs_get_link(struct dentry *, struct inode *, void **); +extern const char *cifs_get_link(struct dentry *, struct inode *, + struct delayed_call *); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname); extern int cifs_removexattr(struct dentry *, const char *); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 6f2439b5..062c237 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -627,7 +627,8 @@ cifs_hl_exit: } const char * -cifs_get_link(struct dentry *direntry, struct inode *inode, void **cookie) +cifs_get_link(struct dentry *direntry, struct inode *inode, + struct delayed_call *done) { int rc = -ENOMEM; unsigned int xid; @@ -680,7 +681,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, void **cookie) kfree(target_path); return ERR_PTR(rc); } - return *cookie = target_path; + set_delayed_call(done, kfree_link, target_path); + return target_path; } int diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index f18139c..1bfb7ba 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -19,7 +19,6 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) static const struct inode_operations coda_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = coda_setattr, }; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index e9de962..db6d692 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -280,31 +280,32 @@ static int configfs_getlink(struct dentry *dentry, char * path) } static const char *configfs_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { - char *page; + char *body; int error; if (!dentry) return ERR_PTR(-ECHILD); - page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!page) + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) return ERR_PTR(-ENOMEM); - error = configfs_getlink(dentry, page); + error = configfs_getlink(dentry, body); if (!error) { - return *cookie = page; + set_delayed_call(done, kfree_link, body); + return body; } - kfree(page); + kfree(body); return ERR_PTR(error); } const struct inode_operations configfs_symlink_inode_operations = { .get_link = configfs_get_link, .readlink = generic_readlink, - .put_link = kfree_put_link, .setattr = configfs_setattr, }; diff 
--git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5a05559..a4dddc61 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -675,7 +675,8 @@ out: } static const char *ecryptfs_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { size_t len; char *buf; @@ -689,7 +690,8 @@ static const char *ecryptfs_get_link(struct dentry *dentry, fsstack_copy_attr_atime(d_inode(dentry), d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; - return *cookie = buf; + set_delayed_call(done, kfree_link, buf); + return buf; } /** @@ -1102,7 +1104,6 @@ out: const struct inode_operations ecryptfs_symlink_iops = { .readlink = generic_readlink, .get_link = ecryptfs_get_link, - .put_link = kfree_put_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .getattr = ecryptfs_getattr_link, diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 4690511..3495d8a 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -23,7 +23,6 @@ const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 3b4bfe2..2281ac2 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -24,7 +24,8 @@ #ifdef CONFIG_EXT4_FS_ENCRYPTION static const char *ext4_encrypted_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; @@ -80,7 +81,8 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, paddr[res] = '\0'; if (cpage) page_cache_release(cpage); - return *cookie = paddr; + set_delayed_call(done, kfree_link, paddr); + return paddr; errout: if (cpage) page_cache_release(cpage); @@ -91,7 +93,6 @@ errout: const struct inode_operations ext4_encrypted_symlink_inode_operations = { .readlink = generic_readlink, .get_link = ext4_encrypted_get_link, - .put_link = kfree_put_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -103,7 +104,6 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2a8d84b..e7587fc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -316,12 +316,14 @@ fail: } static const char *f2fs_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { - const char *link = page_get_link(dentry, inode, cookie); + const char *link = page_get_link(dentry, inode, done); if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ - page_put_link(NULL, *cookie); + do_delayed_call(done); + clear_delayed_call(done); link = ERR_PTR(-ENOENT); } return link; @@ -926,7 +928,8 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, #ifdef CONFIG_F2FS_FS_ENCRYPTION static const char *f2fs_encrypted_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; @@ -988,7 +991,8 @@ static const char *f2fs_encrypted_get_link(struct 
dentry *dentry, paddr[res] = '\0'; page_cache_release(cpage); - return *cookie = paddr; + set_delayed_call(done, kfree_link, paddr); + return paddr; errout: kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); @@ -999,7 +1003,6 @@ errout: const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, .get_link = f2fs_encrypted_get_link, - .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .setxattr = generic_setxattr, @@ -1035,7 +1038,6 @@ const struct inode_operations f2fs_dir_inode_operations = { const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = f2fs_get_link, - .put_link = page_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index def0a4d..712601f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1366,7 +1366,8 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) } static const char *fuse_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -1392,7 +1393,7 @@ static const char *fuse_get_link(struct dentry *dentry, link = ERR_PTR(ret); } else { link[ret] = '\0'; - *cookie = link; + set_delayed_call(done, kfree_link, link); } fuse_invalidate_atime(inode); return link; @@ -1913,7 +1914,6 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, - .put_link = kfree_put_link, .readlink = generic_readlink, .getattr = fuse_getattr, .setxattr = fuse_setxattr, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1095056..1bae189 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1715,7 +1715,7 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry, * gfs2_get_link - Follow a symbolic link * @dentry: The dentry of the link * @inode: The inode of the link - * @cookie: place to store the information for ->put_link() + * @done: destructor for return value * * This can handle symlinks of any size. 
* @@ -1723,7 +1723,8 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry, */ static const char *gfs2_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; @@ -1764,7 +1765,7 @@ static const char *gfs2_get_link(struct dentry *dentry, out: gfs2_glock_dq_uninit(&i_gh); if (!IS_ERR(buf)) - *cookie = buf; + set_delayed_call(done, kfree_link, buf); return buf; } @@ -2138,7 +2139,6 @@ const struct inode_operations gfs2_dir_iops = { const struct inode_operations gfs2_symlink_iops = { .readlink = generic_readlink, .get_link = gfs2_get_link, - .put_link = kfree_put_link, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 6ce5309..7db524cc 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -893,12 +893,13 @@ static const struct inode_operations hostfs_dir_iops = { }; static const char *hostfs_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { char *link; if (!dentry) return ERR_PTR(-ECHILD); - link = __getname(); + link = kmalloc(PATH_MAX, GFP_KERNEL); if (link) { char *path = dentry_name(dentry); int err = -ENOMEM; @@ -909,25 +910,20 @@ static const char *hostfs_get_link(struct dentry *dentry, __putname(path); } if (err < 0) { - __putname(link); + kfree(link); return ERR_PTR(err); } } else { return ERR_PTR(-ENOMEM); } - return *cookie = link; -} - -static void hostfs_put_link(struct inode *unused, void *cookie) -{ - __putname(cookie); + set_delayed_call(done, kfree_link, link); + return link; } static const struct inode_operations hostfs_link_iops = { .readlink = generic_readlink, .get_link = hostfs_get_link, - .put_link = hostfs_put_link, }; static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 0211328..f8db4fd 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -34,7 +34,6 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index f9efdae..117b8b3 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -113,22 +113,24 @@ static int kernfs_getlink(struct dentry *dentry, char *path) } static const char *kernfs_iop_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { - int error = -ENOMEM; - char *page; + char *body; + int error; if (!dentry) return ERR_PTR(-ECHILD); - page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!page) + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, page); + error = kernfs_getlink(dentry, body); if (unlikely(error < 0)) { - kfree(page); + kfree(body); return ERR_PTR(error); } - return *cookie = page; + set_delayed_call(done, kfree_link, body); + return body; } const struct inode_operations kernfs_symlink_iops = { @@ -138,7 +140,6 @@ const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, .get_link = kernfs_iop_get_link, - .put_link = kfree_put_link, .setattr = 
kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, diff --git a/fs/libfs.c b/fs/libfs.c index fec7ab0..0149129 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1019,11 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void kfree_put_link(struct inode *unused, void *cookie) +/* Because kfree isn't assignment-compatible with void(void*) ;-/ */ +void kfree_link(void *p) { - kfree(cookie); + kfree(p); } -EXPORT_SYMBOL(kfree_put_link); +EXPORT_SYMBOL(kfree_link); /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -1087,7 +1088,7 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, EXPORT_SYMBOL(simple_nosetlease); const char *simple_get_link(struct dentry *dentry, struct inode *inode, - void **cookie) + struct delayed_call *done) { return inode->i_link; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 3cce709..cb1789c 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -436,7 +436,6 @@ static const struct address_space_operations minix_aops = { static const struct inode_operations minix_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .getattr = minix_getattr, }; diff --git a/fs/namei.c b/fs/namei.c index 8f51788..3c909ae 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -505,13 +505,13 @@ struct nameidata { int total_link_count; struct saved { struct path link; - void *cookie; + struct delayed_call done; const char *name; - struct inode *inode; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; + struct inode *link_inode; unsigned root_seq; int dfd; }; @@ -592,11 +592,8 @@ static void drop_links(struct nameidata *nd) int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; - struct inode *inode = last->inode; - if (last->cookie && inode->i_op->put_link) { - inode->i_op->put_link(inode, last->cookie); - last->cookie = NULL; - } + do_delayed_call(&last->done); + clear_delayed_call(&last->done); } } @@ -858,9 +855,7 @@ void nd_jump_link(struct path *path) static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; - struct inode *inode = last->inode; - if (last->cookie && inode->i_op->put_link) - inode->i_op->put_link(inode, last->cookie); + do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } @@ -892,7 +887,7 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if owner and follower match. 
*/ - inode = nd->stack[0].inode; + inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; @@ -983,7 +978,7 @@ const char *get_link(struct nameidata *nd) { struct saved *last = nd->stack + nd->depth - 1; struct dentry *dentry = last->link.dentry; - struct inode *inode = last->inode; + struct inode *inode = nd->link_inode; int error; const char *res; @@ -1004,23 +999,21 @@ const char *get_link(struct nameidata *nd) nd->last_type = LAST_BIND; res = inode->i_link; if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { - res = inode->i_op->get_link(NULL, inode, - &last->cookie); + res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD)) { if (unlikely(unlazy_walk(nd, NULL, 0))) return ERR_PTR(-ECHILD); - res = inode->i_op->get_link(dentry, inode, - &last->cookie); + res = get(dentry, inode, &last->done); } } else { - res = inode->i_op->get_link(dentry, inode, - &last->cookie); + res = get(dentry, inode, &last->done); } - if (IS_ERR_OR_NULL(res)) { - last->cookie = NULL; + if (IS_ERR_OR_NULL(res)) return res; - } } if (*res == '/') { if (nd->flags & LOOKUP_RCU) { @@ -1699,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; - last->cookie = NULL; - last->inode = inode; + clear_delayed_call(&last->done); + nd->link_inode = inode; last->seq = seq; return 1; } @@ -4508,26 +4501,25 @@ EXPORT_SYMBOL(readlink_copy); */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - void *cookie; + DEFINE_DELAYED_CALL(done); struct inode *inode = d_inode(dentry); const char *link = inode->i_link; int res; if (!link) { - link = inode->i_op->get_link(dentry, inode, &cookie); + link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); - if (inode->i_op->put_link) - inode->i_op->put_link(inode, cookie); + do_delayed_call(&done); return res; } EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, - void **cookie) + struct delayed_call *callback) { char *kaddr; struct page *page; @@ -4546,7 +4538,7 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, if (IS_ERR(page)) return (char*)page; } - *cookie = page; + set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); @@ -4555,21 +4547,19 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, EXPORT_SYMBOL(page_get_link); -void page_put_link(struct inode *unused, void *cookie) +void page_put_link(void *arg) { - struct page *page = cookie; - page_cache_release(page); + put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - void *cookie = NULL; + DEFINE_DELAYED_CALL(done); int res = readlink_copy(buffer, buflen, page_get_link(dentry, d_inode(dentry), - &cookie)); - if (cookie) - page_put_link(NULL, cookie); + &done)); + do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); @@ -4619,6 +4609,5 @@ EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); diff --git 
a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 3ab6cdb..ce1eb3f 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -245,7 +245,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) static const struct inode_operations ncp_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = ncp_notify_change, }; #endif diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 95c69af..4fe3eea 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -43,7 +43,8 @@ error: } static const char *nfs_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct page *page; void *err; @@ -68,7 +69,7 @@ static const char *nfs_get_link(struct dentry *dentry, if (IS_ERR(page)) return ERR_CAST(page); } - *cookie = page; + set_delayed_call(done, page_put_link, page); return page_address(page); } @@ -78,7 +79,6 @@ static const char *nfs_get_link(struct dentry *dentry, const struct inode_operations nfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = nfs_get_link, - .put_link = page_put_link, .getattr = nfs_getattr, .setattr = nfs_setattr, }; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 63dddb7..7ccdb96 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -570,7 +570,6 @@ const struct inode_operations nilfs_special_inode_operations = { const struct inode_operations nilfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .permission = nilfs_permission, }; diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index b4e79bc..6c2a3e3 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -89,7 +89,6 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, .setxattr = generic_setxattr, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 38a0b8b..964a60f 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -131,19 +131,12 @@ out_dput: return err; } - -struct ovl_link_data { - struct dentry *realdentry; - void *cookie; -}; - static const char *ovl_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct dentry *realdentry; struct inode *realinode; - struct ovl_link_data *data = NULL; - const char *ret; if (!dentry) return ERR_PTR(-ECHILD); @@ -154,38 +147,7 @@ static const char *ovl_get_link(struct dentry *dentry, if (WARN_ON(!realinode->i_op->get_link)) return ERR_PTR(-EPERM); - if (realinode->i_op->put_link) { - data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); - data->realdentry = realdentry; - } - - ret = realinode->i_op->get_link(realdentry, realinode, cookie); - if (IS_ERR_OR_NULL(ret)) { - kfree(data); - return ret; - } - - if (data) - data->cookie = *cookie; - - *cookie = data; - - return ret; -} - -static void ovl_put_link(struct inode *unused, void *c) -{ - struct inode *realinode; - struct ovl_link_data *data = c; - - if (!data) - return; - - realinode = data->realdentry->d_inode; - realinode->i_op->put_link(realinode, data->cookie); - kfree(data); + return realinode->i_op->get_link(realdentry, realinode, done); } static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) @@ 
-383,7 +345,6 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .put_link = ovl_put_link, .readlink = ovl_readlink, .getattr = ovl_getattr, .setxattr = ovl_setxattr, diff --git a/fs/proc/base.c b/fs/proc/base.c index 1a489e2..71660bb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1565,7 +1565,8 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) } static const char *proc_pid_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct path path; int error = -EACCES; @@ -1949,12 +1950,13 @@ struct map_files_info { */ static const char * proc_map_files_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - return proc_pid_get_link(dentry, inode, NULL); + return proc_pid_get_link(dentry, inode, done); } /* diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 10360b2..d0e9b9b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -393,25 +393,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif +static void proc_put_link(void *p) +{ + unuse_pde(p); +} + static const char *proc_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); - *cookie = pde; + set_delayed_call(done, proc_put_link, pde); return pde->data; } -static void proc_put_link(struct inode *unused, void *p) -{ - unuse_pde(p); -} - const struct inode_operations proc_link_inode_operations = { .readlink = generic_readlink, .get_link = proc_get_link, - .put_link = proc_put_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 63861c1..1dece87 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -31,7 +31,8 @@ static const struct proc_ns_operations *ns_entries[] = { }; static const char *proc_ns_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; diff --git a/fs/proc/self.c b/fs/proc/self.c index 7a8b19e..67e8db4 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -19,7 +19,8 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, } static const char *proc_self_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); @@ -32,13 +33,13 @@ static const char *proc_self_get_link(struct dentry *dentry, if (unlikely(!name)) return dentry ? 
ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d", tgid); - return *cookie = name; + set_delayed_call(done, kfree_link, name); + return name; } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, .get_link = proc_self_get_link, - .put_link = kfree_put_link, }; static unsigned self_inum; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 03eaa84..9eacd59 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -20,7 +20,8 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, } static const char *proc_thread_self_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + struct delayed_call *done) { struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); @@ -34,13 +35,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d/task/%d", tgid, pid); - return *cookie = name; + set_delayed_call(done, kfree_link, name); + return name; } static const struct inode_operations proc_thread_self_inode_operations = { .readlink = proc_thread_self_readlink, .get_link = proc_thread_self_get_link, - .put_link = kfree_put_link, }; static unsigned thread_self_inum; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ecbf11e..2a12d46 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1666,7 +1666,6 @@ const struct inode_operations reiserfs_dir_inode_operations = { const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, .getxattr = reiserfs_getxattr, diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 7c635a5..dbcc2f5 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -120,7 +120,6 @@ const struct address_space_operations squashfs_symlink_aops = { const struct inode_operations squashfs_symlink_inode_ops = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .getxattr = generic_getxattr, .listxattr = squashfs_listxattr }; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 80a40bc..07ac18c 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -147,7 +147,6 @@ static inline void write3byte(struct sysv_sb_info *sbi, static const struct inode_operations sysv_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, - .put_link = page_put_link, .getattr = sysv_getattr, }; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f638fd5..06eafaf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -417,7 +417,7 @@ STATIC const char * xfs_vn_get_link( struct dentry *dentry, struct inode *inode, - void **cookie) + struct delayed_call *done) { char *link; int error = -ENOMEM; @@ -433,7 +433,8 @@ xfs_vn_get_link( if (unlikely(error)) goto out_kfree; - return *cookie = link; + set_delayed_call(done, kfree_link, link); + return link; out_kfree: kfree(link); @@ -1177,7 +1178,6 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = xfs_vn_get_link, - .put_link = kfree_put_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/include/linux/delayed_call.h b/include/linux/delayed_call.h new file mode 
100644 index 0000000..f7fa76a --- /dev/null +++ b/include/linux/delayed_call.h @@ -0,0 +1,34 @@ +#ifndef _DELAYED_CALL_H +#define _DELAYED_CALL_H + +/* + * Poor man's closures; I wish we could've done them sanely polymorphic, + * but... + */ + +struct delayed_call { + void (*fn)(void *); + void *arg; +}; + +#define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL} + +/* I really wish we had closures with sane typechecking... */ +static inline void set_delayed_call(struct delayed_call *call, + void (*fn)(void *), void *arg) +{ + call->fn = fn; + call->arg = arg; +} + +static inline void do_delayed_call(struct delayed_call *call) +{ + if (call->fn) + call->fn(call->arg); +} + +static inline void clear_delayed_call(struct delayed_call *call) +{ + call->fn = NULL; +} +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 138e206..5de5edb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -1633,12 +1634,11 @@ struct file_operations { struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); - const char * (*get_link) (struct dentry *, struct inode *, void **); + const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); - void (*put_link) (struct inode *, void *); int (*create) (struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); @@ -2736,13 +2736,14 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); -extern const char *page_get_link(struct dentry *, struct inode *, void **); -extern void page_put_link(struct inode *, void *); +extern const char *page_get_link(struct dentry *, struct inode *, + struct delayed_call *); +extern void page_put_link(void *); extern int __page_symlink(struct inode *inode, const char *symname, int len, int nofs); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; -extern void kfree_put_link(struct inode *, void *); +extern void kfree_link(void *); extern int generic_readlink(struct dentry *, char __user *, int); extern void generic_fillattr(struct inode *, struct kstat *); int vfs_getattr_nosec(struct path *path, struct kstat *stat); @@ -2753,7 +2754,8 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -const char *simple_get_link(struct dentry *, struct inode *, void **); +const char *simple_get_link(struct dentry *, struct inode *, + struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); diff --git a/mm/shmem.c b/mm/shmem.c index 0605716..bab9041 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2496,8 +2496,15 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } +static void shmem_put_link(void *arg) +{ + mark_page_accessed(arg); + put_page(arg); +} + static const char *shmem_get_link(struct dentry *dentry, - struct inode *inode, void **cookie) + struct inode *inode, + 
struct delayed_call *done) { struct page *page = NULL; int error; @@ -2515,17 +2522,10 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(error); unlock_page(page); } - *cookie = page; + set_delayed_call(done, shmem_put_link, page); return page_address(page); } -static void shmem_put_link(struct inode *unused, void *cookie) -{ - struct page *page = cookie; - mark_page_accessed(page); - page_cache_release(page); -} - #ifdef CONFIG_TMPFS_XATTR /* * Superblocks without xattr inode operations may get some security.* xattr @@ -2680,7 +2680,6 @@ static const struct inode_operations shmem_short_symlink_operations = { static const struct inode_operations shmem_symlink_inode_operations = { .readlink = generic_readlink, .get_link = shmem_get_link, - .put_link = shmem_put_link, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, -- cgit v1.1 From d9fe4fab11976e56b2e992980bf6ce948bdf02ac Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 22 Dec 2015 17:54:23 -0700 Subject: x86/mm/pat: Add untrack_pfn_moved for mremap mremap() with MREMAP_FIXED on a VM_PFNMAP range causes the following WARN_ON_ONCE() message in untrack_pfn(). WARNING: CPU: 1 PID: 3493 at arch/x86/mm/pat.c:985 untrack_pfn+0xbd/0xd0() Call Trace: [] dump_stack+0x45/0x57 [] warn_slowpath_common+0x86/0xc0 [] warn_slowpath_null+0x1a/0x20 [] untrack_pfn+0xbd/0xd0 [] unmap_single_vma+0x80e/0x860 [] unmap_vmas+0x55/0xb0 [] unmap_region+0xac/0x120 [] do_munmap+0x28a/0x460 [] move_vma+0x1b3/0x2e0 [] SyS_mremap+0x3b3/0x510 [] entry_SYSCALL_64_fastpath+0x12/0x71 MREMAP_FIXED moves a pfnmap from old vma to new vma. untrack_pfn() is called with the old vma after its pfnmap page table has been removed, which causes follow_phys() to fail. The new vma has a new pfnmap to the same pfn & cache type with VM_PAT set. Therefore, we only need to clear VM_PAT from the old vma in this case. Add untrack_pfn_moved(), which clears VM_PAT from a given old vma. move_vma() is changed to call this function with the old vma when VM_PFNMAP is set. move_vma() then calls do_munmap(), and untrack_pfn() is a no-op since VM_PAT is cleared. Reported-by: Stas Sergeev Signed-off-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1450832064-10093-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 10 ++++++++++ include/asm-generic/pgtable.h | 10 +++++++++- mm/mremap.c | 4 ++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 188e3e0..1aca073 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -992,6 +992,16 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, vma->vm_flags &= ~VM_PAT; } +/* + * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, + * with the old vma after its pfnmap page table has been removed. The new + * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. + */ +void untrack_pfn_moved(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_PAT; +} + pgprot_t pgprot_writecombine(pgprot_t prot) { return __pgprot(pgprot_val(prot) | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 14b0ff32..3a6803c 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -569,7 +569,7 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. 
+ * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ @@ -577,6 +577,13 @@ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { } + +/* + * untrack_pfn_moved is called while mremapping a pfnmap for a new region. + */ +static inline void untrack_pfn_moved(struct vm_area_struct *vma) +{ +} #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, @@ -586,6 +593,7 @@ extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); +extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif #ifdef __HAVE_COLOR_ZERO_PAGE diff --git a/mm/mremap.c b/mm/mremap.c index c25bc62..de824e7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -319,6 +319,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, hiwater_vm = mm->hiwater_vm; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn_moved(vma); + if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); -- cgit v1.1 From 2039e6acaf94d83ec6b6d9f3d0bce7ea1f099918 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 22 Dec 2015 17:54:24 -0700 Subject: x86/mm/pat: Change free_memtype() to support shrinking case Using mremap() to shrink the map size of a VM_PFNMAP range causes the following error message, and leaves the pfn range allocated. x86/PAT: test:3493 freeing invalid memtype [mem 0x483200000-0x4863fffff] This is because rbt_memtype_erase(), called from free_memtype() with spin_lock held, only supports freeing a whole memtype node in memtype_rbroot. Therefore, this patch changes rbt_memtype_erase() to support a request that shrinks the size of a memtype node for mremap(). memtype_rb_exact_match() is renamed to memtype_rb_match(), and is enhanced to support EXACT_MATCH and END_MATCH in @match_type. Since the memtype_rbroot tree allows overlapping ranges, rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free a whole node for the munmap case. If no such entry is found, it then checks with END_MATCH, i.e. shrink the size of a node from the end for the mremap case. In the mremap case, rbt_memtype_erase() proceeds in two steps: 1) remove the node, and then 2) insert the updated node. This allows proper update of augmented values, subtree_max_end, in the tree. Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: H.
Peter Anvin Cc: Borislav Petkov Cc: stsp@list.ru Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1450832064-10093-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 2 +- arch/x86/mm/pat_rbtree.c | 52 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1aca073..031782e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -586,7 +586,7 @@ int free_memtype(u64 start, u64 end) entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (!entry) { + if (IS_ERR(entry)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 6393108..2f77022 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -98,8 +98,13 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, return last_lower; /* Returns NULL if there is no overlap */ } -static struct memtype *memtype_rb_exact_match(struct rb_root *root, - u64 start, u64 end) +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_rb_match(struct rb_root *root, + u64 start, u64 end, int match_type) { struct memtype *match; @@ -107,7 +112,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, while (match != NULL && match->start < end) { struct rb_node *node; - if (match->start == start && match->end == end) + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) return match; node = rb_next(&match->rb); @@ -117,7 +127,7 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, match = NULL; } - return NULL; /* Returns NULL if there is no exact match */ + return NULL; /* Returns NULL if there is no match */ } static int memtype_rb_check_conflict(struct rb_root *root, @@ -210,12 +220,36 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) { struct memtype *data; - data = memtype_rb_exact_match(&memtype_rbroot, start, end); - if (!data) - goto out; + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. 
+ */ + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + } else { + /* mremap: update the end value of this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + data->end = start; + data->subtree_max_end = data->end; + memtype_rb_insert(&memtype_rbroot, data); + return NULL; + } - rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); -out: return data; } -- cgit v1.1 From 8705d603edd49f1cff165cd3b7998f4c7f098d27 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:18 -0800 Subject: x86/vdso: Fix build on PARAVIRT_CLOCK=y, KVM_GUEST=n arch/x86/built-in.o: In function `arch_setup_additional_pages': (.text+0x587): undefined reference to `pvclock_pvti_cpu0_va' KVM_GUEST selects PARAVIRT_CLOCK, so we can make pvclock_pvti_cpu0_va depend on KVM_GUEST. Signed-off-by: Andy Lutomirski Tested-by: Borislav Petkov Cc: Oleg Nesterov Cc: Kees Cook Link: http://lkml.kernel.org/r/444d38a9bcba832685740ea1401b569861d09a72.1451446564.git.luto@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/pvclock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 66df22b..fdcc040 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,7 +4,7 @@ #include #include -#ifdef CONFIG_PARAVIRT_CLOCK +#ifdef CONFIG_KVM_GUEST extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); #else static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -- cgit v1.1 From 7d92de3a8285ab3dfd68aa3a99823acd5b190444 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 3 Dec 2015 17:42:10 +0800 Subject: sched/deadline: Fix the earliest_dl.next logic earliest_dl.next should cache the deadline of the earliest ready task that is also enqueued in the pushable rbtree, as the pull algorithm uses this information to find candidates for migration: if the earliest_dl.next deadline of the source rq is earlier than the earliest_dl.curr deadline of the destination rq, a task from the source rq can be pulled. However, the current implementation only guarantees that earliest_dl.next is the deadline of the next ready task instead of the next pushable task, which can result in taking both rqs' locks and finding nothing to migrate because of affinity constraints. In addition, the current logic doesn't update the next candidate for pushing in pick_next_task_dl(), even if the running task is never eligible. This patch fixes both problems by updating earliest_dl.next when a pushable dl task is enqueued/dequeued, similar to what we already do for RT.
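For reference, the pull-side comparison described above reduces to a wrap-safe deadline test. A minimal standalone sketch (illustration only: worth_pulling() is a hypothetical name, and plain integers stand in for the kernel's rq/dl_rq types):

#include <stdbool.h>
#include <stdint.h>

/* The kernel's dl_time_before(): wrap-safe "a is earlier than b". */
static bool dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

/*
 * A source rq is only worth pulling from when its earliest *pushable*
 * deadline (earliest_dl.next) beats the destination's earliest queued
 * deadline (earliest_dl.curr), hence the need to keep earliest_dl.next
 * in sync with the pushable rbtree, as this patch does.
 */
static bool worth_pulling(uint64_t src_earliest_next, uint64_t dst_earliest_curr)
{
        return dl_time_before(src_earliest_next, dst_earliest_curr);
}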
Tested-by: Luca Abeni Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449135730-27202-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 59 ++++++------------------------------------------- 1 file changed, 7 insertions(+), 52 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e..cd64c97 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) } } - if (leftmost) + if (leftmost) { dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + dl_rq->earliest_dl.next = p->dl.deadline; + } rb_link_node(&p->pushable_dl_tasks, parent, link); rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) next_node = rb_next(&p->pushable_dl_tasks); dl_rq->pushable_dl_tasks_leftmost = next_node; + if (next_node) { + dl_rq->earliest_dl.next = rb_entry(next_node, + struct task_struct, pushable_dl_tasks)->dl.deadline; + } } rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq) #ifdef CONFIG_SMP -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); - -static inline u64 next_deadline(struct rq *rq) -{ - struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); - - if (next && dl_prio(next->prio)) - return next->dl.deadline; - else - return 0; -} - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { - /* - * If the dl_rq had no -deadline tasks, or if the new task - * has shorter deadline than the current one on dl_rq, we - * know that the previous earliest becomes our next earliest, - * as the new task becomes the earliest itself. - */ - dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); - } else if (dl_rq->earliest_dl.next == 0 || - dl_time_before(deadline, dl_rq->earliest_dl.next)) { - /* - * On the other hand, if the new -deadline task has a - * a later deadline than the earliest one on dl_rq, but - * it is earlier than the next (if any), we must - * recompute the next-earliest. 
- */ - dl_rq->earliest_dl.next = next_deadline(rq); } } @@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - dl_rq->earliest_dl.next = next_deadline(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -1274,28 +1251,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Returns the second earliest -deadline task, NULL otherwise */ -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) -{ - struct rb_node *next_node = rq->dl.rb_leftmost; - struct sched_dl_entity *dl_se; - struct task_struct *p = NULL; - -next_node: - next_node = rb_next(next_node); - if (next_node) { - dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); - p = dl_task_of(dl_se); - - if (pick_dl_task(rq, p, cpu)) - return p; - - goto next_node; - } - - return NULL; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: -- cgit v1.1 From 25ec02f2c14466a4549c5dcc044b628c2cc46fde Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 21 Dec 2015 15:25:30 +0100 Subject: x86/fpu: Properly align size in CHECK_MEMBER_AT_END_OF() macro The CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) macro checks whether MEMBER is the last member of TYPE by evaluating: offsetof(TYPE::MEMBER) + sizeof(TYPE::MEMBER) == sizeof(TYPE) This condition breaks on structs that are padded to be aligned. This patch ensures the TYPE alignment is taken into account. This bug was revealed after adding cacheline alignment into struct sched_entity, which broke the task_struct::thread check: CHECK_MEMBER_AT_END_OF(struct task_struct, thread); Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Dave Hansen Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1450707930-3445-1-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index be39b5f..8e839e7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -143,9 +143,18 @@ static void __init fpu__init_system_generic(void) unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -/* Enforce that 'MEMBER' is the last field of 'TYPE': */ +/* Get alignment of the TYPE. */ +#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) + +/* + * Enforce that 'MEMBER' is the last field of 'TYPE'. + * + * Align the computed size with alignment of the TYPE, + * because that's how C aligns structs. + */ #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) + BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \ + TYPE_ALIGN(TYPE))) /* * We append the 'struct fpu' to the task_struct: -- cgit v1.1 From 5a1078043f844074cbd53981432778a8d5dd56e9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 8 Dec 2015 21:23:59 +0100 Subject: sched/core: Move sched_entity::avg into separate cache line The sched_entity::avg field collides with read-mostly sched_entity data. The perf c2c tool showed many read HITM accesses across many CPUs for sched_entity's cfs_rq and my_q, while at the same time there were tons of stores to avg.
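To illustrate the false-sharing pattern perf c2c exposed, here is a sketch only, with a made-up struct rather than the kernel's sched_entity; the kernel spells the attribute ____cacheline_aligned_in_smp, while plain C11 can use alignas:

#include <stdalign.h>

struct example {
        /* read-mostly fields: CPUs keep shared, clean cached copies */
        const void *cfs_rq_like;
        const void *my_q_like;
        /*
         * Write-hot state starts on its own 64-byte cache line, so
         * frequent stores here no longer invalidate the readers'
         * cached copies of the fields above.
         */
        alignas(64) unsigned long avg_like[4];
};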
After placing sched_entity::avg into a separate cache line, perf bench sched pipe showed a speedup of around 20 seconds. NOTE: I cut out all perf events except cycles and instructions from the following output. Before: $ perf stat -r 5 perf bench sched pipe -l 10000000 # Running 'sched/pipe' benchmark: # Executed 10000000 pipe operations between two processes Total time: 270.348 [sec] 27.034805 usecs/op 36989 ops/sec ... 245,537,074,035 cycles # 1.433 GHz 187,264,548,519 instructions # 0.77 insns per cycle 272.653840535 seconds time elapsed ( +- 1.31% ) After: $ perf stat -r 5 perf bench sched pipe -l 10000000 # Running 'sched/pipe' benchmark: # Executed 10000000 pipe operations between two processes Total time: 251.076 [sec] 25.107678 usecs/op 39828 ops/sec ... 244,573,513,928 cycles # 1.572 GHz 187,409,641,157 instructions # 0.76 insns per cycle 251.679315188 seconds time elapsed ( +- 0.31% ) Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Don Zickus Cc: Joe Mario Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449606239-28602-1-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 791b47e..0c0e781 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1268,8 +1268,13 @@ struct sched_entity { #endif #ifdef CONFIG_SMP - /* Per entity load average tracking */ - struct sched_avg avg; + /* + * Per entity load average tracking. + * + * Put into separate cache line so it does not + * collide with read-mostly values above. + */ + struct sched_avg avg ____cacheline_aligned_in_smp; #endif }; -- cgit v1.1 From 0905f04eb21fc1c2e690bed5d0418a061d56c225 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU, since they were never added to it in the first place. The same also applies to a never-used group entity. Fix it in remove_entity_load_avg(): when the entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). Reported-by: Steve Muckle Signed-off-by: Yuyang Du [peterz: cfs_rq_last_update_time] Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 93efb96..1926606 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2900,27 +2900,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now).
- */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); + + return last_update_time; +} #else - last_update_time = cfs_rq->avg.last_update_time; +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} #endif +/* + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). + */ +void remove_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); -- cgit v1.1 From 7b648018f628eee73450b71dc68ebb3c3865465e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Dec 2015 18:35:21 +0100 Subject: perf/core: Collapse more IPI loops This patch collapses the two 'hard' cases, which are perf_event_{dis,en}able(). I cannot seem to convince myself the current code is correct. So, starting with perf_event_disable(): we don't strictly need to test for event->state == ACTIVE; ctx->is_active is enough. If the event is not scheduled while the ctx is, __perf_event_disable() still does the right thing. It's a little less efficient to IPI in that case, but over-all simpler. For perf_event_enable(), the same goes, but I think that's actually broken in its current form. The current condition is: ctx->is_active && event->state == OFF, which means it doesn't do anything when !ctx->active && event->state == OFF. This is wrong, it should still mark the event INACTIVE in that case, otherwise we'll still not try to schedule the event once the context becomes active again. This patch implements the two functions using the new event_function_call() and does away with the tricky event->state tests. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 106 ++++++++++++++++----------------------------------- 1 file changed, 33 insertions(+), 73 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2c7bb20..bf82441 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1766,6 +1766,20 @@ int __perf_event_disable(void *info) return 0; } +void ___perf_event_disable(void *info) +{ + struct perf_event *event = info; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } +} + /* * Disable a event.
* @@ -1782,43 +1796,16 @@ int __perf_event_disable(void *info) static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; + return; } raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_event_disable, + ___perf_event_disable, event); } /* @@ -2269,6 +2256,11 @@ unlock: return 0; } +void ___perf_event_enable(void *info) +{ + __perf_event_mark_enabled((struct perf_event *)info); +} + /* * Enable a event. * @@ -2281,58 +2273,26 @@ unlock: static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - /* * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. + * + * That way, if we see the event in error state below, we know that it + * has gone back into error state, as distinct from the task having + * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event); - goto out; - } - raw_spin_unlock_irq(&ctx->lock); - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_enable, + ___perf_event_enable, event); } /* -- cgit v1.1 From 957ea1fdbcdb909e1540f06f06f1a9ce6e696efa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Dec 2015 13:22:19 -0800 Subject: perf/x86: Remove warning for zero PEBS status The recent commit: 75f80859b130 ("perf/x86/intel/pebs: Robustify PEBS buffer drain") causes lots of warnings on different CPUs before Skylake when running PEBS intensive workloads. 
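The root cause, explained next, is a PEBS record whose status field ends up zero. A rough standalone sketch of the resulting skip logic (illustration only: pebs_record_usable() is a hypothetical helper with plain scalar arguments, not the kernel code):

#include <stdint.h>

/*
 * A record whose status has no overlap with the currently enabled
 * PEBS counters cannot be attributed to any event; the patch below
 * drops such records silently instead of WARN()ing from NMI context.
 */
static int pebs_record_usable(uint64_t record_status, uint64_t pebs_enabled,
                              unsigned int max_pebs_events)
{
        uint64_t status = record_status & pebs_enabled;

        status &= (1ULL << max_pebs_events) - 1;
        return status != 0;
}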
These CPUs can have a zero status field in the PEBS record when PEBS is racing with clearing of GLOBAL_STATUS. This can also cause hangs (it seems there are still problems with printk in NMI). Disable the warning, but still ignore the record. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1449177740-5422-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5db1c77..0e3a9c7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1232,10 +1232,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); - if (WARN(bit >= x86_pmu.max_pebs_events, - "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", - (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, - *(unsigned long long *)cpuc->active_mask)) + if (bit >= x86_pmu.max_pebs_events) continue; /* -- cgit v1.1 From 01330d7288e0050c5aaabc558059ff91589e67cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Dec 2015 13:22:20 -0800 Subject: perf/x86: Allow zero PEBS status with only single active event Normally we drop PEBS events with a zero status field. But when there is only a single PEBS event active we can assume the PEBS record is for that event. The PEBS buffer is always flushed when PEBS events are disabled, so there is no risk of mishandling stale PEBS records this way. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1449177740-5422-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 0e3a9c7..cd1993e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1230,6 +1230,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) pebs_status = p->status & cpuc->pebs_enabled; pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + /* + * On some CPUs the PEBS status can be zero when PEBS is + * racing with clearing of GLOBAL_STATUS. + * + * Normally we would drop that record, but in the + * case when there is only a single active PEBS event + * we can assume it's for that event. + */ + if (!pebs_status && cpuc->pebs_enabled && + !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) + pebs_status = cpuc->pebs_enabled; + bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); if (bit >= x86_pmu.max_pebs_events) -- cgit v1.1 From 442f5c74cbeaf54939980397ece59360c0a824e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Dec 2015 03:50:32 -0800 Subject: perf/x86: Use INST_RETIRED.TOTAL_CYCLES_PS for cycles:pp for Skylake I added UOPS_RETIRED.ALL by mistake to the Skylake PEBS event list for cycles:pp. But the event is not documented for Skylake, and has some issues.
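As an aside, the single-active-event assumption in the previous patch reduces to an "exactly one bit set" test; a minimal standalone sketch (illustration only; the helper name is hypothetical):

#include <stdbool.h>
#include <stdint.h>

/*
 * x && !(x & (x - 1)) is the classic single-bit test: clearing the
 * lowest set bit must leave nothing behind. It matches the
 * cpuc->pebs_enabled & (cpuc->pebs_enabled-1) check in the patch.
 */
static bool single_pebs_event_active(uint64_t pebs_enabled)
{
        return pebs_enabled && !(pebs_enabled & (pebs_enabled - 1));
}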
The recommended replacement for cycles:pp is to use INST_RETIRED.ANY+pebs as a base, similar to what CPUs before Sandy Bridge did. This new event is called INST_RETIRED.TOTAL_CYCLES_PS. The event is not really new, but had already been used by perf before Sandy Bridge for the original cycles:p. Note the SDM doesn't document that event either, but it's being documented in the latest version of the event list on: https://download.01.org/perfmon/SKL This patch does: - Remove UOPS_RETIRED.ALL from the Skylake PEBS event list - Add INST_RETIRED.ANY to the Skylake PEBS event list, and a table entry to allow cmask=16,inv=1 for cycles:pp - We don't need an extra entry for the base INST_RETIRED event, because it is already covered by the catch-all PEBS table entry. - Switch Skylake to use the Core2 PEBS alias (which is INST_RETIRED.TOTAL_CYCLES_PS) Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1448929689-13771-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 33b4b67..5ed6e0d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3521,7 +3521,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_skl_event_constraints; x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; x86_pmu.extra_regs = intel_skl_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_core2; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index cd1993e..56b5015 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -718,9 +718,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ -- cgit v1.1 From 724697648eec540b2a7561089b1c87cb33e6a0eb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Dec 2015 03:50:52 -0800 Subject: perf/x86: Use INST_RETIRED.PREC_DIST for cycles:ppp Add a new 'three-p' precise level that uses INST_RETIRED.PREC_DIST as its base. The basic mechanism of abusing the inverse cmask to get all cycles works the same as before. PREC_DIST is available on Sandy Bridge or later. It had some problems on Sandy Bridge, so we only use it on IvyBridge and later.
It had some problems on Sandy Bridge, so we only use it on IvyBridge and later. I tested it on Broadwell and Skylake. PREC_DIST has special support for avoiding shadow effects, which can give better results compare to UOPS_RETIRED. The drawback is that PREC_DIST can only schedule on counter 1, but that is ok for cycle sampling, as there is normally no need to do multiple cycle sampling runs in parallel. It is still possible to run perf top in parallel, as that doesn't use precise mode. Also of course the multiplexing can still allow parallel operation. :pp stays with the previous event. Example: Sample a loop with 10 sqrt with old cycles:pp 0.14 │10: sqrtps %xmm1,%xmm0 <-------------- 9.13 │ sqrtps %xmm1,%xmm0 11.58 │ sqrtps %xmm1,%xmm0 11.51 │ sqrtps %xmm1,%xmm0 6.27 │ sqrtps %xmm1,%xmm0 10.38 │ sqrtps %xmm1,%xmm0 12.20 │ sqrtps %xmm1,%xmm0 12.74 │ sqrtps %xmm1,%xmm0 5.40 │ sqrtps %xmm1,%xmm0 10.14 │ sqrtps %xmm1,%xmm0 10.51 │ ↑ jmp 10 We expect all 10 sqrt to get roughly the sample number of samples. But you can see that the instruction directly after the JMP is systematically underestimated in the result, due to sampling shadow effects. With the new PREC_DIST based sampling this problem is gone and all instructions show up roughly evenly: 9.51 │10: sqrtps %xmm1,%xmm0 11.74 │ sqrtps %xmm1,%xmm0 11.84 │ sqrtps %xmm1,%xmm0 6.05 │ sqrtps %xmm1,%xmm0 10.46 │ sqrtps %xmm1,%xmm0 12.25 │ sqrtps %xmm1,%xmm0 12.18 │ sqrtps %xmm1,%xmm0 5.26 │ sqrtps %xmm1,%xmm0 10.13 │ sqrtps %xmm1,%xmm0 10.43 │ sqrtps %xmm1,%xmm0 0.16 │ ↑ jmp 10 Even with PREC_DIST there is still sampling skid and the result is not completely even, but systematic shadow effects are significantly reduced. The improvements are mainly expected to make a difference in high IPC code. With low IPC it should be similar. 
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 ++ arch/x86/kernel/cpu/perf_event.h | 3 +- arch/x86/kernel/cpu/perf_event_intel.c | 50 ++++++++++++++++++++++++++++--- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++++ 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9dfbba5..e7e63a9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -482,6 +482,9 @@ int x86_pmu_hw_config(struct perf_event *event) /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; + + if (x86_pmu.pebs_prec_dist) + precise++; } if (event->attr.precise_ip > precise) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 799e6bd..ce8768f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -583,7 +583,8 @@ struct x86_pmu { bts_active :1, pebs :1, pebs_active :1, - pebs_broken :1; + pebs_broken :1, + pebs_prec_dist :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5ed6e0d..762c602 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2475,6 +2475,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event) } } +static void intel_pebs_aliases_precdist(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.PREC_DIST + * (0x01c0), which is a PEBS capable event, to get the same + * count. + * + * The PREC_DIST event has special support to minimize sample + * shadowing effects. One drawback is that it can be + * only programmed on counter 1, but that seems like an + * acceptable trade off. 
+ */ + u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_ivb(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_snb(event); + return intel_pebs_aliases_precdist(event); +} + +static void intel_pebs_aliases_skl(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_core2(event); + return intel_pebs_aliases_precdist(event); +} + static unsigned long intel_pmu_free_running_flags(struct perf_event *event) { unsigned long flags = x86_pmu.free_running_flags; @@ -3431,7 +3469,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_ivb_event_constraints; x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; if (boot_cpu_data.x86_model == 62) x86_pmu.extra_regs = intel_snbep_extra_regs; else @@ -3464,7 +3503,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_hsw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; @@ -3499,7 +3539,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_bdw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; @@ -3521,7 +3562,8 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_skl_event_constraints; x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; x86_pmu.extra_regs = intel_skl_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_core2; + x86_pmu.pebs_aliases = intel_pebs_aliases_skl; + x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 56b5015..9c0f8d4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -686,6 +686,8 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -700,6 +702,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). 
*/ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ @@ -718,6 +722,8 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ -- cgit v1.1 From 61b87cae6361ea6af161c1ffa549898892707b19 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 7 Dec 2015 20:33:25 +0100 Subject: perf/x86: Fix filter_events() bug with event mappings This patch fixes a bug in the filter_events() function, whereby if some mappings did not exist, e.g. STALLED_CYCLES_FRONTEND, then any event after it in the attrs array would disappear from the published list of events in /sys/devices/cpu/events. This could be verified easily on any system post SNB (which does not publish STALLED_CYCLES_FRONTEND): $ ./perf stat -e cycles,ref-cycles true Performance counter stats for 'true': 1,217,348 cycles ref-cycles The problem is that filter_events() assumes that the argument (attrs) is organized in increasing, contiguous event indexes related to the event_map(). But if we remove the non-supported events by shifting the position in the array, then the lookup via x86_pmu.event_map() needs to compensate for it, otherwise we are looking up the wrong index. This patch corrects this problem by compensating for the deleted events, and with that ref-cycles reappears (here shown on Haswell): $ perf stat -e ref-cycles,cycles true Performance counter stats for 'true': 4,525,910 ref-cycles 1,064,920 cycles 0.002943888 seconds time elapsed Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: jolsa@kernel.org Cc: kan.liang@intel.com Fixes: 8300daa26755 ("perf/x86: Filter out undefined events from sysfs events attribute") Link: http://lkml.kernel.org/r/1449516805-6637-1-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e7e63a9..1b443db 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1534,6 +1534,7 @@ static void __init filter_events(struct attribute **attrs) { struct device_attribute *d; struct perf_pmu_events_attr *pmu_attr; + int offset = 0; int i, j; for (i = 0; attrs[i]; i++) { @@ -1542,7 +1543,7 @@ static void __init filter_events(struct attribute **attrs) /* str trumps id */ if (pmu_attr->event_str) continue; - if (x86_pmu.event_map(i)) + if (x86_pmu.event_map(i + offset)) continue; for (j = i; attrs[j]; j++) @@ -1550,6 +1551,14 @@ static void __init filter_events(struct attribute **attrs) /* Check the shifted attr.
*/ i--; + + /* + * event_map() is index based, the attrs array is organized + * by increasing event index. If we shift the events, then + * we need to compensate for the event_map(), otherwise + * we are looking up the wrong event in the map + */ + offset++; } } -- cgit v1.1 From 6fc2e83077b05a061afe9b24f2fdff7a0434eb67 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Dec 2015 23:33:17 +0100 Subject: perf/x86: Fix LBR related crashes on Intel Atom This patch fixes the LBR kernel crashes on Intel Atom. The kernel was assuming that if the CPU supports the 64-bit LBR format, then it has an LBR_SELECT MSR. Atom uses the 64-bit LBR format but does not have LBR_SELECT. That was causing NULL pointer dereferences in a couple of places. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Fixes: 96f3eda67fcf ("perf/x86/intel: Fix static checker warning in lbr enable") Link: http://lkml.kernel.org/r/1449182000-31524-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index e2fad0c..1390148 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -161,7 +161,7 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi) + if (!pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); @@ -430,7 +430,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) */ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { - bool need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); + bool need_info = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); @@ -438,8 +438,11 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) int out = 0; int num = x86_pmu.lbr_nr; - if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; + if (cpuc->lbr_sel) { + need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + } for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; -- cgit v1.1 From 1424a09a9e1839285e948d4ea9fdfca26c9a2086 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Dec 2015 23:33:18 +0100 Subject: perf/x86: fix PEBS issues on Intel Atom/Core2 This patch fixes broken PEBS support on Intel Atom and Core2, caused by wrong pointer arithmetic in intel_pmu_drain_pebs_core(). get_next_pebs_record_by_bit() was also being called for PEBS format fmt0, which does not use the pebs_record_nhm layout.
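[ Editor's note: a hedged sketch of the arithmetic bug fixed below, with simplified declarations; in intel_pmu_drain_pebs_core() the buffer bounds are typed pointers, so C pointer subtraction already yields a record count:

	struct pebs_record_core *at, *top;	/* PEBS buffer bounds */

	n = (top - at) / x86_pmu.pebs_record_size;	/* old: divides by the record size twice */
	n = top - at;					/* new: the number of records */

The get_next_pebs_record_by_bit() hunk is the companion fix: fmt0 records carry no status bitfield to search, so the helper now simply returns the base record. ]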
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Fixes: 21509084f999 ("perf/x86/intel: Handle multiple records in the PEBS buffer") Link: http://lkml.kernel.org/r/1449182000-31524-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 9c0f8d4..a7463ed 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1106,6 +1106,13 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) void *at; u64 pebs_status; + /* + * fmt0 does not have a status bitfield (does not use + * perf_record_nhm format) + */ + if (x86_pmu.intel_cap.pebs_format < 1) + return base; + if (base == NULL) return NULL; @@ -1191,7 +1198,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = (top - at) / x86_pmu.pebs_record_size; + n = top - at; if (n <= 0) return; -- cgit v1.1 From 673d188ba5b1cef6f9a41a5a18b490b2831c3ea5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Dec 2015 21:03:10 +0100 Subject: perf/x86: Enable cycles:pp for Intel Atom This patch updates the PEBS support for Intel Atom to provide an alias for the cycles:pp event used by perf record/top by default nowadays. On Atom, only INST_RETIRED:ANY supports PEBS, so we use this event instead with a large cmask to count cycles. Given that Core2 has the same issue, we use the intel_pebs_aliases_core2() function for Atom as well. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Link: http://lkml.kernel.org/r/1449172990-30183-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 762c602..95980c0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3370,6 +3370,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_gen_event_constraints; x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index a7463ed..10602f0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -620,6 +620,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END }; -- cgit v1.1 From d3bcd64bbc35076a80c56918c905ddb167d097d8 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Fri, 4 Dec 2015 18:07:41 +0800 Subject: perf/x86/rapl: Use unified perf_event_sysfs_show instead of special interface rapl_sysfs_show() is effectively a duplicate of perf_event_sysfs_show(). We prefer to use the unified interface. Signed-off-by: Huang Rui Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dasaratharaman Chandramouli Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1449223661-2437-1-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index fb5843d..24a351a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -109,11 +109,11 @@ static struct kobj_attribute format_attr_##_var = \ #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ -#define RAPL_EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ }; struct rapl_pmu { @@ -405,19 +405,6 @@ static struct attribute_group rapl_pmu_attr_group = { .attrs = rapl_pmu_attrs, }; -static ssize_t rapl_sysfs_show(struct device *dev, - struct device_attribute *attr, - char *page) -{ - struct perf_pmu_events_attr *pmu_attr = \ - container_of(attr, struct perf_pmu_events_attr, attr); - - if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); - - return 0; -} - RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -- cgit v1.1 From d6980ef32570e2a26e05b1183788f4b70f1f27d0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 3 Dec 2015 16:00:11 -0500 Subject: perf/x86/intel/uncore: Add Broadwell-EP uncore support The uncore subsystem for Broadwell-EP is similar to Haswell-EP. There are some differences in PCI device IDs, box numbers and constraints. This patch extends the Broadwell-DE code to support Broadwell-EP.
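[ Editor's note: for readers following the long device-ID tables in the diff below: each entry packs an (uncore type, box index) pair into driver_data, and the generic uncore PCI probe unpacks it to pick the PMU instance. A hedged sketch of the helpers (the exact bit split is quoted from memory, see perf_event_intel_uncore.h):

	#define UNCORE_PCI_DEV_DATA(type, idx)	((type) << 8 | (idx))
	#define UNCORE_PCI_DEV_TYPE(data)	(((data) >> 8) & 0xff)
	#define UNCORE_PCI_DEV_IDX(data)	((data) & 0xff)

So an entry such as { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1) } routes the second Home Agent device to box 1 of the "ha" PMU type. ]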
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1449176411-9499-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 + .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 152 ++++++++++++++++++++- 2 files changed, 149 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 61215a6..b63271c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -966,6 +966,7 @@ static int __init uncore_pci_init(void) case 63: /* Haswell-EP */ ret = hswep_uncore_pci_init(); break; + case 79: /* BDX-EP */ case 86: /* BDX-DE */ ret = bdx_uncore_pci_init(); break; @@ -1287,6 +1288,7 @@ static int __init uncore_cpu_init(void) case 63: /* Haswell-EP */ hswep_uncore_cpu_init(); break; + case 79: /* BDX-EP */ case 86: /* BDX-DE */ bdx_uncore_cpu_init(); break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index f0f4fcb..f2ddfcc 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -2338,7 +2338,7 @@ int hswep_uncore_pci_init(void) } /* end of Haswell-EP uncore support */ -/* BDX-DE uncore support */ +/* BDX uncore support */ static struct intel_uncore_type bdx_uncore_ubox = { .name = "ubox", @@ -2360,13 +2360,14 @@ static struct event_constraint bdx_uncore_cbox_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x09, 0x3), UNCORE_EVENT_CONSTRAINT(0x11, 0x1), UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), EVENT_CONSTRAINT_END }; static struct intel_uncore_type bdx_uncore_cbox = { .name = "cbox", .num_counters = 4, - .num_boxes = 8, + .num_boxes = 24, .perf_ctr_bits = 48, .event_ctl = HSWEP_C0_MSR_PMON_CTL0, .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, @@ -2379,9 +2380,24 @@ static struct intel_uncore_type bdx_uncore_cbox = { .format_group = &hswep_uncore_cbox_format_group, }; +static struct intel_uncore_type bdx_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, + &bdx_uncore_sbox, &hswep_uncore_pcu, NULL, }; @@ -2396,7 +2412,7 @@ void bdx_uncore_cpu_init(void) static struct intel_uncore_type bdx_uncore_ha = { .name = "ha", .num_counters = 4, - .num_boxes = 1, + .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), }; @@ -2404,7 +2420,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", .num_counters = 5, - .num_boxes = 2, + .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, @@ -2424,6 +2440,19 @@ static struct intel_uncore_type bdx_uncore_irp = { .format_group = &snbep_uncore_format_group, }; +static struct intel_uncore_type bdx_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + 
.perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; static struct event_constraint bdx_uncore_r2pcie_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x10, 0x3), @@ -2432,6 +2461,8 @@ static struct event_constraint bdx_uncore_r2pcie_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x23, 0x1), UNCORE_EVENT_CONSTRAINT(0x25, 0x1), UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), EVENT_CONSTRAINT_END }; @@ -2445,18 +2476,65 @@ static struct intel_uncore_type bdx_uncore_r2pcie = { SNBEP_UNCORE_PCI_COMMON_INIT(), }; +static struct event_constraint bdx_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x7), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 3, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + enum { BDX_PCI_UNCORE_HA, BDX_PCI_UNCORE_IMC, BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_QPI, BDX_PCI_UNCORE_R2PCIE, + BDX_PCI_UNCORE_R3QPI, }; static struct intel_uncore_type *bdx_pci_uncores[] = { [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, NULL, }; @@ -2465,6 +2543,10 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), + }, { /* MC0 Channel 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), @@ -2473,14 +2555,74 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 3 */ + 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), + }, { /* IRP */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2), + }, { /* R2PCIe */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), + }, + { /* QPI Port 2 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), + }, { /* end: all zeroes */ } }; @@ -2500,4 +2642,4 @@ int bdx_uncore_pci_init(void) return 0; } -/* end of BDX-DE uncore support */ +/* end of BDX uncore support */ -- cgit v1.1 From 1e7b93906249a7ccca730be03168ace15f95709e Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Mon, 7 Dec 2015 14:28:18 -0800 Subject: perf/x86/intel: Add perf core PMU support for Intel Knights Landing The Knights Landing core is based on the Silvermont core, with several differences. Like Silvermont, Knights Landing has 8 pairs of LBR MSRs. However, the LBR MSR addresses match those of the Xeon cores' first 8 pairs of LBR MSRs. Unlike Silvermont, Knights Landing supports hyperthreading. The Knights Landing offcore response events' config register mask is also different from that of Silvermont. This patch was developed based on a patch from Andi Kleen.
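[ Editor's note: the offcore response difference shows up in the new intel_knl_extra_regs[] in the diff below: the third argument of INTEL_UEVENT_EXTRA_REG() is the mask of valid MSR_OFFCORE_RSP_* bits, which the generic code checks before accepting a raw event. A hedged sketch of that check (logic as in x86_pmu_extra_regs(), quoted from memory):

	if (event->attr.config1 & ~er->valid_mask)
		return -EINVAL;	/* offcore response bit not implemented here */

Hence the KNL-specific masks (0x7f9ffbffffull / 0x3f9ffbffffull) rather than the Silvermont ones. ]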
For more details, please refer to the public document: https://software.intel.com/sites/default/files/managed/15/8d/IntelXeonPhi%E2%84%A2x200ProcessorPerformanceMonitoringReferenceManual_Volume1_Registers_v0%206.pdf Signed-off-by: Harish Chegondi Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Harish Chegondi Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Lukasz Anaczkowski Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/d14593c7311f78c93c9cf6b006be843777c5ad5c.1449517401.git.harish.chegondi@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_intel.c | 62 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 14 +++++++ 3 files changed, 78 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ce8768f..7bb61e3 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -902,6 +902,8 @@ void intel_pmu_lbr_init_hsw(void); void intel_pmu_lbr_init_skl(void); +void intel_pmu_lbr_init_knl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 95980c0..a667078 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_knl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, + MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, + MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), + EVENT_EXTRA_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ +#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ +#define KNL_MCDRAM_LOCAL BIT_ULL(21) +#define KNL_MCDRAM_FAR BIT_ULL(22) +#define KNL_DDR_LOCAL BIT_ULL(23) +#define KNL_DDR_FAR BIT_ULL(24) +#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ + KNL_DDR_LOCAL | KNL_DDR_FAR) +#define KNL_L2_READ SLM_DMND_READ +#define KNL_L2_WRITE SLM_DMND_WRITE +#define KNL_L2_PREFETCH SLM_DMND_PREFETCH +#define KNL_L2_ACCESS SLM_LLC_ACCESS +#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ + KNL_DRAM_ANY | SNB_SNP_ANY | \ + SNB_NON_DRAM) + +static __initconst const u64 knl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, + }, + }, +}; + /* * Use from PMIs where the LBRs are already disabled. 
*/ @@ -3553,6 +3597,24 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; + case 87: /* Knights Landing Xeon Phi */ + memcpy(hw_cache_event_ids, + slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, + knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_knl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_knl_extra_regs; + + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + pr_cont("Knights Landing events, "); + break; + case 78: /* 14nm Skylake Mobile */ case 94: /* 14nm Skylake Desktop */ x86_pmu.late_ack = true; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 1390148..653f88d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -1046,3 +1046,17 @@ void __init intel_pmu_lbr_init_atom(void) */ pr_cont("8-deep LBR, "); } + +/* Knights Landing */ +void intel_pmu_lbr_init_knl(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + pr_cont("8-deep LBR, "); +} -- cgit v1.1 From dae25530a44ad9e6523495ebc8b37bb0a1640490 Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Mon, 7 Dec 2015 14:32:31 -0800 Subject: perf/x86/intel/uncore: Remove hard coding of PMON box control MSR offset Call uncore_pci_box_ctl() function to get the PMON box control MSR offset instead of hard coding the offset. This would allow us to use this snbep_uncore_pci_init_box() function for other PCI PMON devices whose box control MSR offset is different from SNBEP_PCI_PMON_BOX_CTL. Signed-off-by: Harish Chegondi Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Harish Chegondi Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Lukasz Anaczkowski Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/872e8ef16cfc38e5ff3b45fac1094e6f1722e4ad.1449470704.git.harish.chegondi@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index f2ddfcc..bfb9656 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -315,8 +315,9 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); + pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); } static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) -- cgit v1.1 From 77af0037de0a280eeabc632890de871f062ea7be Mon Sep 17 00:00:00 2001 From: Harish Chegondi Date: Mon, 7 Dec 2015 14:32:32 -0800 Subject: perf/x86/intel/uncore: Add Knights Landing uncore PMU support Knights Landing uncore performance monitoring (perfmon) is derived from Haswell-EP uncore perfmon with several differences. 
One notable difference is in PCI device IDs. Knights Landing uses common PCI device ID for multiple instances of an uncore PMU device type. In Haswell-EP, each instance of a PMU device type has a unique device ID. Knights Landing uncore components that have performance monitoring units are UBOX, CHA, EDC, MC, M2PCIe, IRP and PCU. Perfmon registers in EDC, MC, IRP, and M2PCIe reside in the PCIe configuration space. Perfmon registers in UBOX, CHA and PCU are accessed via the MSR interface. For more details, please refer to the public document: https://software.intel.com/sites/default/files/managed/15/8d/IntelXeonPhi%E2%84%A2x200ProcessorPerformanceMonitoringReferenceManual_Volume1_Registers_v0%206.pdf Signed-off-by: Harish Chegondi Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Harish Chegondi Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Lukasz Anaczkowski Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/8ac513981264c3eb10343a3f523f19cc5a2d12fe.1449470704.git.harish.chegondi@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 15 + arch/x86/kernel/cpu/perf_event_intel_uncore.h | 3 + arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 2 +- .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 480 +++++++++++++++++++++ 4 files changed, 499 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index b63271c..f97f807 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -884,6 +884,15 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * each box has a different function id. */ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + /* Knights Landing uses a common PCI device ID for multiple instances of + * an uncore PMU device type. There is only one entry per device type in + * the knl_uncore_pci_ids table inspite of multiple devices present for + * some device types. Hence PCI device idx would be 0 for all devices. + * So increment pmu pointer to point to an unused array element. 
+ */ + if (boot_cpu_data.x86_model == 87) + while (pmu->func_id >= 0) + pmu++; if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else @@ -983,6 +992,9 @@ static int __init uncore_pci_init(void) case 61: /* Broadwell */ ret = bdw_uncore_pci_init(); break; + case 87: /* Knights Landing */ + ret = knl_uncore_pci_init(); + break; default: return 0; } @@ -1292,6 +1304,9 @@ static int __init uncore_cpu_init(void) case 86: /* BDX-DE */ bdx_uncore_cpu_init(); break; + case 87: /* Knights Landing */ + knl_uncore_cpu_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 2f0a4a9..07aa2d6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -338,6 +338,7 @@ int hsw_uncore_pci_init(void); int bdw_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); +int snb_pci2phy_map_init(int devid); /* perf_event_intel_uncore_snbep.c */ int snbep_uncore_pci_init(void); @@ -348,6 +349,8 @@ int hswep_uncore_pci_init(void); void hswep_uncore_cpu_init(void); int bdx_uncore_pci_init(void); void bdx_uncore_cpu_init(void); +int knl_uncore_pci_init(void); +void knl_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 8452561..0b93482 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -417,7 +417,7 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags) } } -static int snb_pci2phy_map_init(int devid) +int snb_pci2phy_map_init(int devid) { struct pci_dev *dev = NULL; struct pci2phy_map *map; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index bfb9656..33acb88 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -209,31 +209,98 @@ #define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 #define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 +/* KNL Ubox */ +#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) +/* KNL CHA */ +#define KNL_CHA_MSR_OFFSET 0xc +#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) +#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff +#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) +#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) + +/* KNL EDC/MC UCLK */ +#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 +#define KNL_UCLK_MSR_PMON_CTL0 0x420 +#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 +#define KNL_PMON_FIXED_CTL_EN 0x1 + +/* KNL EDC */ +#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 +#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 +#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 + +/* KNL MC */ +#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 +#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 +#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 +#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c +#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 + +/* KNL IRP */ +#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 +#define 
KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +/* KNL PCU */ +#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f +#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) +#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 +#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ + (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ + KNL_PCU_PMON_CTL_USE_OCC_CTR | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_CBO_PMON_CTL_TID_EN | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); +DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); @@ -1729,6 +1796,419 @@ int ivbep_uncore_pci_init(void) } /* end of IvyTown uncore support */ +/* KNL uncore support */ +static struct attribute *knl_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute_group 
knl_uncore_ubox_format_group = { + .name = "format", + .attrs = knl_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_ubox_format_group, +}; + +static struct attribute *knl_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid4.attr, + &format_attr_filter_link3.attr, + &format_attr_filter_state4.attr, + &format_attr_filter_local.attr, + &format_attr_filter_all_op.attr, + &format_attr_filter_nnm.attr, + &format_attr_filter_opc3.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group knl_uncore_cha_format_group = { + .name = "format", + .attrs = knl_uncore_cha_formats_attr, +}; + +static struct event_constraint knl_uncore_cha_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg knl_uncore_cha_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), + EVENT_EXTRA_END +}; + +static u64 knl_cha_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x4) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP; + return mask; +} + +static struct event_constraint * +knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); +} + +static int knl_cha_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = knl_uncore_cha_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event); + +static struct intel_uncore_ops knl_uncore_cha_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = knl_cha_hw_config, + .get_constraint = knl_cha_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type knl_uncore_cha = { + .name = "cha", + .num_counters = 4, + .num_boxes = 38, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + 
.event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = KNL_CHA_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = knl_uncore_cha_constraints, + .ops = &knl_uncore_cha_ops, + .format_group = &knl_uncore_cha_format_group, +}; + +static struct attribute *knl_uncore_pcu_formats_attr[] = { + &format_attr_event2.attr, + &format_attr_use_occ_ctr.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh6.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + NULL, +}; + +static struct attribute_group knl_uncore_pcu_format_group = { + .name = "format", + .attrs = knl_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *knl_msr_uncores[] = { + &knl_uncore_ubox, + &knl_uncore_cha, + &knl_uncore_pcu, + NULL, +}; + +void knl_uncore_cpu_init(void) +{ + uncore_msr_uncores = knl_msr_uncores; +} + +static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) + == UNCORE_FIXED_EVENT) + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | KNL_PMON_FIXED_CTL_EN); + else + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops knl_uncore_imc_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = knl_uncore_imc_enable_box, + .read_counter = snbep_uncore_pci_read_counter, + .enable_event = knl_uncore_imc_enable_event, + .disable_event = snbep_uncore_pci_disable_event, +}; + +static struct intel_uncore_type knl_uncore_imc_uclk = { + .name = "imc_uclk", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_imc_dclk = { + .name = "imc", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW, + .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL, + .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_uclk = { + .name = "edc_uclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = 
KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_eclk = { + .name = "edc_eclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW, + .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL, + .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct event_constraint knl_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type knl_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = knl_uncore_m2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct attribute *knl_uncore_irp_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group knl_uncore_irp_format_group = { + .name = "format", + .attrs = knl_uncore_irp_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, + .format_group = &knl_uncore_irp_format_group, +}; + +enum { + KNL_PCI_UNCORE_MC_UCLK, + KNL_PCI_UNCORE_MC_DCLK, + KNL_PCI_UNCORE_EDC_UCLK, + KNL_PCI_UNCORE_EDC_ECLK, + KNL_PCI_UNCORE_M2PCIE, + KNL_PCI_UNCORE_IRP, +}; + +static struct intel_uncore_type *knl_pci_uncores[] = { + [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk, + [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk, + [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk, + [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk, + [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie, + [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp, + NULL, +}; + +/* + * KNL uses a common PCI device ID for multiple instances of an Uncore PMU + * device type. prior to KNL, each instance of a PMU device type had a unique + * device ID. 
+ * + * PCI Device ID Uncore PMU Devices + * ---------------------------------- + * 0x7841 MC0 UClk, MC1 UClk + * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2, + * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2 + * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk, + * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk + * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk, + * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk + * 0x7817 M2PCIe + * 0x7814 IRP +*/ + +static const struct pci_device_id knl_uncore_pci_ids[] = { + { /* MC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), + }, + { /* MC DClk Channel */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), + }, + { /* EDC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), + }, + { /* EDC EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), + }, + { /* M2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver knl_uncore_pci_driver = { + .name = "knl_uncore", + .id_table = knl_uncore_pci_ids, +}; + +int knl_uncore_pci_init(void) +{ + int ret; + + /* All KNL PCI based PMON units are on the same PCI bus except IRP */ + ret = snb_pci2phy_map_init(0x7814); /* IRP */ + if (ret) + return ret; + ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */ + if (ret) + return ret; + uncore_pci_uncores = knl_pci_uncores; + uncore_pci_driver = &knl_uncore_pci_driver; + return 0; +} + +/* end of KNL uncore support */ + /* Haswell-EP uncore support */ static struct attribute *hswep_uncore_ubox_formats_attr[] = { &format_attr_event.attr, -- cgit v1.1 From 9cc2617de5b9222abb39cd02e90d57dfea99c6d7 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 9 Dec 2015 11:34:45 -0500 Subject: perf/x86/amd: Remove l1-dcache-stores event for AMD This is a long standing bug with the l1-dcache-stores generic event on AMD machines. My perf_event testsuite has been complaining about this for years and I'm finally getting around to trying to get it fixed. The data_cache_refills:system event does not make sense for l1-dcache-stores. Maybe this was a typo and it was meant to be for l1-dcache-store-misses? In any case, the values returned are nowhere near correct for l1-dcache-stores and in fact the umask values for the event have completely changed with fam15h so it makes even less sense than ever. So just remove it. 
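[ Editor's note: zeroing the table entry makes the generic event unavailable rather than bogus. A hedged sketch of the effect (logic as in set_ext_hw_attr(), quoted from memory):

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
	if (val == 0)
		return -ENOENT;	/* generic cache event not supported */

So after this patch, perf stat -e L1-dcache-stores on AMD should report the event as not supported instead of returning data_cache_refills:system counts. ]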
Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1512091134350.24311@vincent-weaver-1.umelst.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 1cee5d2..05e76bf 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -18,7 +18,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, }, [ C(OP_PREFETCH) ] = { -- cgit v1.1 From fd36f3dd79331b9610664b867ff205465bf9ce68 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:06:58 +0900 Subject: perf hist: Pass struct sample to __hists__add_entry() This is a preparation to add more info into the hist_entry. Also it already passes too many arguments, so passing the sample directly will reduce the overhead of the function call. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 7 +++++-- tools/perf/builtin-diff.c | 11 +++++------ tools/perf/tests/hists_link.c | 6 +++--- tools/perf/util/hist.c | 31 +++++++++++++++++-------------- tools/perf/util/hist.h | 4 ++-- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index e18f1b9..b5b8db0 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -47,7 +47,7 @@ struct perf_annotate { }; static int perf_evsel__add_sample(struct perf_evsel *evsel, - struct perf_sample *sample __maybe_unused, + struct perf_sample *sample, struct addr_location *al, struct perf_annotate *ann) { @@ -72,7 +72,10 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel, return 0; } - he = __hists__add_entry(hists, al, NULL, NULL, NULL, 1, 1, 0, true); + sample->period = 1; + sample->weight = 1; + + he = __hists__add_entry(hists, al, NULL, NULL, NULL, sample, true); if (he == NULL) return -ENOMEM; diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 0b180a8..69f5b1fe 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -311,11 +311,11 @@ static int formula_fprintf(struct hist_entry *he, struct hist_entry *pair, } static int hists__add_entry(struct hists *hists, - struct addr_location *al, u64 period, - u64 weight, u64 transaction) + struct addr_location *al, + struct perf_sample *sample) { - if (__hists__add_entry(hists, al, NULL, NULL, NULL, period, weight, - transaction, true) != NULL) + if (__hists__add_entry(hists, al, NULL, NULL, NULL, + sample, true) != NULL) return 0; return -ENOMEM; } @@ -336,8 +336,7 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused, return -1; } - if (hists__add_entry(hists, &al, sample->period, - sample->weight, sample->transaction)) { + if (hists__add_entry(hists, &al, sample)) { pr_warning("problem incrementing symbol period, skipping event\n"); goto out_put; } diff --git
a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 6243e2b..9eac98d 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -64,7 +64,7 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine) struct perf_evsel *evsel; struct addr_location al; struct hist_entry *he; - struct perf_sample sample = { .period = 1, }; + struct perf_sample sample = { .period = 1, .weight = 1, }; size_t i = 0, k; /* @@ -90,7 +90,7 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine) goto out; he = __hists__add_entry(hists, &al, NULL, - NULL, NULL, 1, 1, 0, true); + NULL, NULL, &sample, true); if (he == NULL) { addr_location__put(&al); goto out; @@ -116,7 +116,7 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine) goto out; he = __hists__add_entry(hists, &al, NULL, - NULL, NULL, 1, 1, 0, true); + NULL, NULL, &sample, true); if (he == NULL) { addr_location__put(&al); goto out; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 56e97f5..039bb91 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -461,7 +461,7 @@ struct hist_entry *__hists__add_entry(struct hists *hists, struct symbol *sym_parent, struct branch_info *bi, struct mem_info *mi, - u64 period, u64 weight, u64 transaction, + struct perf_sample *sample, bool sample_self) { struct hist_entry entry = { @@ -478,15 +478,15 @@ struct hist_entry *__hists__add_entry(struct hists *hists, .level = al->level, .stat = { .nr_events = 1, - .period = period, - .weight = weight, + .period = sample->period, + .weight = sample->weight, }, .parent = sym_parent, .filtered = symbol__parent_filter(sym_parent) | al->filtered, .hists = hists, .branch_info = bi, .mem_info = mi, - .transaction = transaction, + .transaction = sample->transaction, }; return hists__findnew_entry(hists, &entry, al, sample_self); @@ -526,12 +526,13 @@ iter_add_single_mem_entry(struct hist_entry_iter *iter, struct addr_location *al u64 cost; struct mem_info *mi = iter->priv; struct hists *hists = evsel__hists(iter->evsel); + struct perf_sample *sample = iter->sample; struct hist_entry *he; if (mi == NULL) return -EINVAL; - cost = iter->sample->weight; + cost = sample->weight; if (!cost) cost = 1; @@ -542,8 +543,10 @@ iter_add_single_mem_entry(struct hist_entry_iter *iter, struct addr_location *al * and this is indirectly achieved by passing period=weight here * and the he_stat__add_period() function. */ + sample->period = cost; + he = __hists__add_entry(hists, al, iter->parent, NULL, mi, - cost, cost, 0, true); + sample, true); if (!he) return -ENOMEM; @@ -630,6 +633,7 @@ iter_add_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *a struct branch_info *bi; struct perf_evsel *evsel = iter->evsel; struct hists *hists = evsel__hists(evsel); + struct perf_sample *sample = iter->sample; struct hist_entry *he = NULL; int i = iter->curr; int err = 0; @@ -643,9 +647,11 @@ iter_add_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *a * The report shows the percentage of total branches captured * and not events sampled. Thus we use a pseudo period of 1. */ + sample->period = 1; + sample->weight = bi->flags.cycles ? bi->flags.cycles : 1; + he = __hists__add_entry(hists, al, iter->parent, &bi[i], NULL, - 1, bi->flags.cycles ? 
bi->flags.cycles : 1, - 0, true); + sample, true); if (he == NULL) return -ENOMEM; @@ -682,8 +688,7 @@ iter_add_single_normal_entry(struct hist_entry_iter *iter, struct addr_location struct hist_entry *he; he = __hists__add_entry(evsel__hists(evsel), al, iter->parent, NULL, NULL, - sample->period, sample->weight, - sample->transaction, true); + sample, true); if (he == NULL) return -ENOMEM; @@ -744,8 +749,7 @@ iter_add_single_cumulative_entry(struct hist_entry_iter *iter, int err = 0; he = __hists__add_entry(hists, al, iter->parent, NULL, NULL, - sample->period, sample->weight, - sample->transaction, true); + sample, true); if (he == NULL) return -ENOMEM; @@ -818,8 +822,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, } he = __hists__add_entry(evsel__hists(evsel), al, iter->parent, NULL, NULL, - sample->period, sample->weight, - sample->transaction, false); + sample, false); if (he == NULL) return -ENOMEM; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index a48a207..36439bf 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -114,8 +114,8 @@ struct hist_entry *__hists__add_entry(struct hists *hists, struct addr_location *al, struct symbol *parent, struct branch_info *bi, - struct mem_info *mi, u64 period, - u64 weight, u64 transaction, + struct mem_info *mi, + struct perf_sample *sample, bool sample_self); int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, int max_stack_depth, void *arg); -- cgit v1.1 From 723928340c9d28d92dcaff8b8fbc9100a1cf9429 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Dec 2015 11:16:17 +0900 Subject: perf hist: Save raw_data/size for tracepoint events The raw_data and raw_size fields are to provide tracepoint specific information. They will be used by dynamic sort keys later. 
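The subtle point in this change, visible in the hist.c hunk below, is ownership: the sample's raw buffer belongs to the mmap'ed ring buffer and is reused for the next event, so a hist_entry that wants to keep the payload must duplicate it on insertion and release it on deletion. A stripped-down sketch of that pattern, with a simplified type standing in for the real perf structures:

#include <stdlib.h>
#include <string.h>

struct entry {
	void		*raw_data;
	unsigned int	 raw_size;
};

/* Duplicate the transient sample payload; this is what memdup() does
 * in the patch, since the source buffer is about to be overwritten. */
static int entry__set_raw(struct entry *he, const void *raw, unsigned int size)
{
	he->raw_data = malloc(size);
	if (he->raw_data == NULL)
		return -1;
	memcpy(he->raw_data, raw, size);
	he->raw_size = size;
	return 0;
}

/* Mirrors the free(he->raw_data) added to hist_entry__delete(). */
static void entry__clear_raw(struct entry *he)
{
	free(he->raw_data);
	he->raw_data = NULL;
	he->raw_size = 0;
}

int main(void)
{
	struct entry he = { 0, 0 };
	const char payload[] = "next_pid=17773";

	if (entry__set_raw(&he, payload, sizeof(payload)) == 0)
		entry__clear_raw(&he);
	return 0;
}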
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450923377-18641-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 24 ++++++++++++++++++++++++ tools/perf/util/sort.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 039bb91..2dcf38a 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -369,6 +369,25 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template, if (symbol_conf.use_callchain) callchain_init(he->callchain); + if (he->raw_data) { + he->raw_data = memdup(he->raw_data, he->raw_size); + + if (he->raw_data == NULL) { + map__put(he->ms.map); + if (he->branch_info) { + map__put(he->branch_info->from.map); + map__put(he->branch_info->to.map); + free(he->branch_info); + } + if (he->mem_info) { + map__put(he->mem_info->iaddr.map); + map__put(he->mem_info->daddr.map); + } + free(he->stat_acc); + free(he); + return NULL; + } + } INIT_LIST_HEAD(&he->pairs.node); thread__get(he->thread); } @@ -487,6 +506,8 @@ struct hist_entry *__hists__add_entry(struct hists *hists, .branch_info = bi, .mem_info = mi, .transaction = sample->transaction, + .raw_data = sample->raw_data, + .raw_size = sample->raw_size, }; return hists__findnew_entry(hists, &entry, al, sample_self); @@ -801,6 +822,8 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter, .sym = al->sym, }, .parent = iter->parent, + .raw_data = sample->raw_data, + .raw_size = sample->raw_size, }; int i; struct callchain_cursor cursor; @@ -974,6 +997,7 @@ void hist_entry__delete(struct hist_entry *he) if (he->srcfile && he->srcfile[0]) free(he->srcfile); free_callchain(he->callchain); + free(he->raw_data); free(he); } diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 86f05e7..d298987 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -122,6 +122,8 @@ struct hist_entry { struct branch_info *branch_info; struct hists *hists; struct mem_info *mem_info; + void *raw_data; + u32 raw_size; struct callchain_root callchain[0]; /* must be last member */ }; -- cgit v1.1 From be45d40efec96558c489579bbf93465e90b10abe Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 22:08:41 +0900 Subject: tools lib traceevent: Factor out and export print_event_field[s]() The print_event_field() and print_event_fields() functions print basic information of a given field or event without the print format. They'll be used by dynamic sort keys later. Committer note: Rename it to pevent_print_field[s]() to get proper namespacing, as discussed with Steven Rostedt. 
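The exported pair is meant to be driven through a trace_seq, which is exactly how the dynamic sort key code later in this series uses it. A hedged usage sketch, assuming a struct format_field and raw event data already obtained through libtraceevent, and assuming the era's event-parse.h declares the trace_seq helpers; error handling omitted:

#include <stdio.h>
#include "event-parse.h"	/* pevent_print_field(), struct trace_seq */

/* Print one field of an event record without applying its print fmt. */
static void show_field(struct format_field *field, void *raw_data)
{
	struct trace_seq seq;

	trace_seq_init(&seq);
	pevent_print_field(&seq, raw_data, field);
	printf("%s=%s\n", field->name, seq.buffer);
	trace_seq_destroy(&seq);
}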
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Acked-by: Steven Rostedt Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1450876121-22494-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/traceevent/event-parse.c | 129 +++++++++++++++++++------------------ tools/lib/traceevent/event-parse.h | 4 ++ 2 files changed, 72 insertions(+), 61 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 68276f3..ea69ce3 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -4735,73 +4735,80 @@ static int is_printable_array(char *p, unsigned int len) return 1; } -static void print_event_fields(struct trace_seq *s, void *data, - int size __maybe_unused, - struct event_format *event) +void pevent_print_field(struct trace_seq *s, void *data, + struct format_field *field) { - struct format_field *field; unsigned long long val; unsigned int offset, len, i; - - field = event->format.fields; - while (field) { - trace_seq_printf(s, " %s=", field->name); - if (field->flags & FIELD_IS_ARRAY) { - offset = field->offset; - len = field->size; - if (field->flags & FIELD_IS_DYNAMIC) { - val = pevent_read_number(event->pevent, data + offset, len); - offset = val; - len = offset >> 16; - offset &= 0xffff; - } - if (field->flags & FIELD_IS_STRING && - is_printable_array(data + offset, len)) { - trace_seq_printf(s, "%s", (char *)data + offset); - } else { - trace_seq_puts(s, "ARRAY["); - for (i = 0; i < len; i++) { - if (i) - trace_seq_puts(s, ", "); - trace_seq_printf(s, "%02x", - *((unsigned char *)data + offset + i)); - } - trace_seq_putc(s, ']'); - field->flags &= ~FIELD_IS_STRING; - } + struct pevent *pevent = field->event->pevent; + + if (field->flags & FIELD_IS_ARRAY) { + offset = field->offset; + len = field->size; + if (field->flags & FIELD_IS_DYNAMIC) { + val = pevent_read_number(pevent, data + offset, len); + offset = val; + len = offset >> 16; + offset &= 0xffff; + } + if (field->flags & FIELD_IS_STRING && + is_printable_array(data + offset, len)) { + trace_seq_printf(s, "%s", (char *)data + offset); } else { - val = pevent_read_number(event->pevent, data + field->offset, - field->size); - if (field->flags & FIELD_IS_POINTER) { - trace_seq_printf(s, "0x%llx", val); - } else if (field->flags & FIELD_IS_SIGNED) { - switch (field->size) { - case 4: - /* - * If field is long then print it in hex. - * A long usually stores pointers. - */ - if (field->flags & FIELD_IS_LONG) - trace_seq_printf(s, "0x%x", (int)val); - else - trace_seq_printf(s, "%d", (int)val); - break; - case 2: - trace_seq_printf(s, "%2d", (short)val); - break; - case 1: - trace_seq_printf(s, "%1d", (char)val); - break; - default: - trace_seq_printf(s, "%lld", val); - } - } else { + trace_seq_puts(s, "ARRAY["); + for (i = 0; i < len; i++) { + if (i) + trace_seq_puts(s, ", "); + trace_seq_printf(s, "%02x", + *((unsigned char *)data + offset + i)); + } + trace_seq_putc(s, ']'); + field->flags &= ~FIELD_IS_STRING; + } + } else { + val = pevent_read_number(pevent, data + field->offset, + field->size); + if (field->flags & FIELD_IS_POINTER) { + trace_seq_printf(s, "0x%llx", val); + } else if (field->flags & FIELD_IS_SIGNED) { + switch (field->size) { + case 4: + /* + * If field is long then print it in hex. + * A long usually stores pointers. 
+ */ if (field->flags & FIELD_IS_LONG) - trace_seq_printf(s, "0x%llx", val); + trace_seq_printf(s, "0x%x", (int)val); else - trace_seq_printf(s, "%llu", val); + trace_seq_printf(s, "%d", (int)val); + break; + case 2: + trace_seq_printf(s, "%2d", (short)val); + break; + case 1: + trace_seq_printf(s, "%1d", (char)val); + break; + default: + trace_seq_printf(s, "%lld", val); } + } else { + if (field->flags & FIELD_IS_LONG) + trace_seq_printf(s, "0x%llx", val); + else + trace_seq_printf(s, "%llu", val); + } + } +} + +void pevent_print_fields(struct trace_seq *s, void *data, + int size __maybe_unused, struct event_format *event) +{ + struct format_field *field; + + field = event->format.fields; + while (field) { + trace_seq_printf(s, " %s=", field->name); + pevent_print_field(s, data, field); field = field->next; } } @@ -4827,7 +4834,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event if (event->flags & EVENT_FL_FAILED) { trace_seq_printf(s, "[FAILED TO PARSE]"); - print_event_fields(s, data, size, event); + pevent_print_fields(s, data, size, event); return; } @@ -5301,7 +5308,7 @@ void pevent_event_info(struct trace_seq *s, struct event_format *event, int print_pretty = 1; if (event->pevent->print_raw || (event->flags & EVENT_FL_PRINTRAW)) - print_event_fields(s, record->data, record->size, event); + pevent_print_fields(s, record->data, record->size, event); else { if (event->handler && !(event->flags & EVENT_FL_NOHANDLE)) diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 6fc83c7..706d9bc 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -705,6 +705,10 @@ struct cmdline *pevent_data_pid_from_comm(struct pevent *pevent, const char *com struct cmdline *next); int pevent_cmdline_pid(struct pevent *pevent, struct cmdline *cmdline); +void pevent_print_field(struct trace_seq *s, void *data, + struct format_field *field); +void pevent_print_fields(struct trace_seq *s, void *data, + int size __maybe_unused, struct event_format *event); void pevent_event_info(struct trace_seq *s, struct event_format *event, struct pevent_record *record); int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum, -- cgit v1.1 From 54f8f40384ab940e15585afde5c278c8e7726214 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:01 +0900 Subject: perf top: Create the evlist sooner This is a preparation to support dynamic sort keys for tracepoint events. Dynamic sort keys can be created for specific fields in trace events, so the sort routines need the event information. The next patch passes the evlist to them; create the evlist sooner here so that it can do so.
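Taken together with the setup_sorting() change in the next patch, the resulting ordering in cmd_top() is: populate the event list first, then parse the sort keys against it. A condensed, self-contained model of that dependency (the types and function names are stand-ins for the real perf structures, not the actual tool code):

#include <stdio.h>

struct evlist { int nr_entries; };

static int add_default_event(struct evlist *e)
{
	e->nr_entries = 1;	/* perf_evlist__add_default() in perf */
	return 0;
}

static int setup_sorting(struct evlist *e)
{
	/* a dynamic key like "sched:sched_switch.next_pid" would be
	 * resolved against 'e' here, so it must be populated already */
	return e->nr_entries ? 0 : -1;
}

int main(void)
{
	struct evlist evlist = { 0 };

	if (!evlist.nr_entries && add_default_event(&evlist) < 0)
		return 1;
	return setup_sorting(&evlist) ? 1 : 0;
}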
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-5-git-send-email-namhyung@kernel.org [ Split from the patch passing the evlist to the sort routines ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 9ebd67a..4e913d8 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1231,6 +1231,12 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) if (argc) usage_with_options(top_usage, options); + if (!top.evlist->nr_entries && + perf_evlist__add_default(top.evlist) < 0) { + pr_err("Not enough memory for event selector list\n"); + goto out_delete_evlist; + } + sort__mode = SORT_MODE__TOP; /* display thread wants entries to be collapsed in a different tree */ sort__need_collapse = 1; @@ -1277,12 +1283,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) goto out_delete_evlist; } - if (!top.evlist->nr_entries && - perf_evlist__add_default(top.evlist) < 0) { - ui__error("Not enough memory for event selector list\n"); - goto out_delete_evlist; - } - symbol_conf.nr_events = top.evlist->nr_entries; if (top.delay_secs < 1) -- cgit v1.1 From 40184c46a3055a97e2efa69da6f17c05bff4b776 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:01 +0900 Subject: perf tools: Pass evlist to setup_sorting() This is a preparation to support dynamic sort keys for tracepoint events. Dynamic sort keys can be created for specific fields in trace events so it needs the event information. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-5-git-send-email-namhyung@kernel.org [ Moving the evlist creation earlier in top was split to a previous patch ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-annotate.c | 2 +- tools/perf/builtin-diff.c | 2 +- tools/perf/builtin-report.c | 2 +- tools/perf/builtin-top.c | 2 +- tools/perf/tests/hists_cumulate.c | 8 ++++---- tools/perf/tests/hists_filter.c | 2 +- tools/perf/tests/hists_link.c | 2 +- tools/perf/tests/hists_output.c | 10 +++++----- tools/perf/util/sort.c | 15 +++++++++------ tools/perf/util/sort.h | 5 +++-- 10 files changed, 27 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index b5b8db0..cc5c126 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -370,7 +370,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused) if (ret < 0) goto out_delete; - if (setup_sorting() < 0) + if (setup_sorting(NULL) < 0) usage_with_options(annotate_usage, options); if (annotate.use_stdio) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 69f5b1fe..8706383 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1279,7 +1279,7 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused) sort__mode = SORT_MODE__DIFF; - if (setup_sorting() < 0) + if (setup_sorting(NULL) < 0) usage_with_options(diff_usage, options); setup_pager(); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2a7330b..ea53c81 100644 --- a/tools/perf/builtin-report.c +++ 
b/tools/perf/builtin-report.c @@ -897,7 +897,7 @@ repeat: symbol_conf.cumulate_callchain = false; } - if (setup_sorting() < 0) { + if (setup_sorting(session->evlist) < 0) { if (sort_order) parse_options_usage(report_usage, options, "s", 1); if (field_order) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4e913d8..0058259 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1241,7 +1241,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) /* display thread wants entries to be collapsed in a different tree */ sort__need_collapse = 1; - if (setup_sorting() < 0) { + if (setup_sorting(top.evlist) < 0) { if (sort_order) parse_options_usage(top_usage, options, "s", 1); if (field_order) diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 8292948..e360892 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -281,7 +281,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine) symbol_conf.cumulate_callchain = false; perf_evsel__reset_sample_bit(evsel, CALLCHAIN); - setup_sorting(); + setup_sorting(NULL); callchain_register_param(&callchain_param); err = add_hist_entries(hists, machine); @@ -428,7 +428,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine) symbol_conf.cumulate_callchain = false; perf_evsel__set_sample_bit(evsel, CALLCHAIN); - setup_sorting(); + setup_sorting(NULL); callchain_register_param(&callchain_param); err = add_hist_entries(hists, machine); @@ -486,7 +486,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine) symbol_conf.cumulate_callchain = true; perf_evsel__reset_sample_bit(evsel, CALLCHAIN); - setup_sorting(); + setup_sorting(NULL); callchain_register_param(&callchain_param); err = add_hist_entries(hists, machine); @@ -670,7 +670,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine) symbol_conf.cumulate_callchain = true; perf_evsel__set_sample_bit(evsel, CALLCHAIN); - setup_sorting(); + setup_sorting(NULL); callchain_register_param(&callchain_param); err = add_hist_entries(hists, machine); diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index ccb5b49..2a784be 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -122,7 +122,7 @@ int test__hists_filter(int subtest __maybe_unused) goto out; /* default sort order (comm,dso,sym) will be used */ - if (setup_sorting() < 0) + if (setup_sorting(NULL) < 0) goto out; machines__init(&machines); diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 9eac98d..c764d69 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -294,7 +294,7 @@ int test__hists_link(int subtest __maybe_unused) goto out; /* default sort order (comm,dso,sym) will be used */ - if (setup_sorting() < 0) + if (setup_sorting(NULL) < 0) goto out; machines__init(&machines); diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index 248beec..ebe6cd4 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -134,7 +134,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine) field_order = NULL; sort_order = NULL; /* equivalent to sort_order = "comm,dso,sym" */ - setup_sorting(); + setup_sorting(NULL); /* * expected output: @@ -236,7 +236,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine) field_order = "overhead,cpu"; sort_order = "pid"; - setup_sorting(); + 
setup_sorting(NULL); /* * expected output: @@ -292,7 +292,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine) field_order = "comm,overhead,dso"; sort_order = NULL; - setup_sorting(); + setup_sorting(NULL); /* * expected output: @@ -366,7 +366,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine) field_order = "dso,sym,comm,overhead,dso"; sort_order = "sym"; - setup_sorting(); + setup_sorting(NULL); /* * expected output: @@ -468,7 +468,7 @@ static int test5(struct perf_evsel *evsel, struct machine *machine) field_order = "cpu,pid,comm,dso,sym"; sort_order = "dso,pid"; - setup_sorting(); + setup_sorting(NULL); /* * expected output: diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 2d8ccd4..0c038a2 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -4,6 +4,8 @@ #include "comm.h" #include "symbol.h" #include "evsel.h" +#include "evlist.h" +#include regex_t parent_regex; const char default_parent_pattern[] = "^sys_|^do_page_fault"; @@ -1583,7 +1585,8 @@ int hpp_dimension__add_output(unsigned col) return __hpp_dimension__add_output(&hpp_sort_dimensions[col]); } -int sort_dimension__add(const char *tok) +static int sort_dimension__add(const char *tok, + struct perf_evlist *evlist __maybe_unused) { unsigned int i; @@ -1712,7 +1715,7 @@ static int setup_sort_order(void) return 0; } -static int __setup_sorting(void) +static int __setup_sorting(struct perf_evlist *evlist) { char *tmp, *tok, *str; const char *sort_keys; @@ -1743,7 +1746,7 @@ static int __setup_sorting(void) for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { - ret = sort_dimension__add(tok); + ret = sort_dimension__add(tok, evlist); if (ret == -EINVAL) { error("Invalid --sort key: `%s'", tok); break; @@ -1954,16 +1957,16 @@ out: return ret; } -int setup_sorting(void) +int setup_sorting(struct perf_evlist *evlist) { int err; - err = __setup_sorting(); + err = __setup_sorting(evlist); if (err < 0) return err; if (parent_pattern != default_parent_pattern) { - err = sort_dimension__add("parent"); + err = sort_dimension__add("parent", evlist); if (err < 0) return err; } diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index d298987..1a00f1e 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -226,10 +226,11 @@ struct sort_entry { extern struct sort_entry sort_thread; extern struct list_head hist_entry__sort_list; -int setup_sorting(void); +struct perf_evlist; +struct pevent; +int setup_sorting(struct perf_evlist *evlist); int setup_output_field(void); void reset_output_field(void); -extern int sort_dimension__add(const char *); void sort__setup_elide(FILE *fp); void perf_hpp__set_elide(int idx, bool elide); -- cgit v1.1 From c7c2a5e40f17ab3b14716d4f08d03792a9b683e7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:02 +0900 Subject: perf tools: Add dynamic sort key for tracepoint events The existing sort keys are less useful for tracepoint events in that they are always sampled at the same place, the function where the tracepoint is located. For example, a 'perf report' on sched:sched_switch event looks like the following: # Overhead Command Shared Object Symbol # ........ ............... ................ .............. 
# 47.22% swapper [kernel.vmlinux] [k] __schedule 21.67% transmission-gt [kernel.vmlinux] [k] __schedule 8.23% netctl-auto [kernel.vmlinux] [k] __schedule 5.53% kworker/0:1H [kernel.vmlinux] [k] __schedule 1.98% Xephyr [kernel.vmlinux] [k] __schedule 1.33% irq/33-iwlwifi [kernel.vmlinux] [k] __schedule 1.17% wpa_cli [kernel.vmlinux] [k] __schedule 1.13% rcu_preempt [kernel.vmlinux] [k] __schedule 0.85% ksoftirqd/0 [kernel.vmlinux] [k] __schedule 0.77% Timer [kernel.vmlinux] [k] __schedule In fact, tracepoints have meaningful information in their fields but there's no way to use them in 'perf report' currently. The dynamic sort keys are introduced in this patch to overcome this limitation. The sched:sched_switch events have the following fields: # sudo cat /sys/kernel/debug/tracing/events/sched/sched_switch/format name: sched_switch ID: 268 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char prev_comm[16]; offset:8; size:16; signed:1; field:pid_t prev_pid; offset:24; size:4; signed:1; field:int prev_prio; offset:28; size:4; signed:1; field:long prev_state; offset:32; size:8; signed:1; field:char next_comm[16]; offset:40; size:16; signed:1; field:pid_t next_pid; offset:56; size:4; signed:1; field:int next_prio; offset:60; size:4; signed:1; print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, REC->prev_state & (2048-1) ? __print_flags(REC->prev_state & (2048-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, { 128, "K"}, { 256, "W" }, { 512, "P" }, { 1024, "N" }) : "R", REC->prev_state & 2048 ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio With dynamic sort keys, you can use an event field such as sched:sched_switch.next_pid as a sort key. Those dynamic keys are checked and created on demand. For instance, below sorts by the next_pid field on the same data file: $ perf report -s comm,sched:sched_switch.next_pid --stdio ... # Overhead Command next_pid # ........ ............... .......... # 21.23% transmission-gt 0 20.86% swapper 17773 6.62% netctl-auto 0 5.25% swapper 109 5.21% kworker/0:1H 0 1.98% Xephyr 0 1.98% swapper 6524 1.98% swapper 27478 1.37% swapper 27476 1.17% swapper 233 Multiple dynamic sort keys are also supported: $ perf report -s comm,sched:sched_switch.next_pid,sched:sched_switch.next_comm --stdio ... # Overhead Command next_pid next_comm # ........ ............... .......... ................ # 20.86% swapper 17773 transmission-gt 9.64% transmission-gt 0 swapper/0 9.16% transmission-gt 0 swapper/2 5.25% swapper 109 kworker/0:1H 5.21% kworker/0:1H 0 swapper/0 2.14% netctl-auto 0 swapper/2 1.98% netctl-auto 0 swapper/0 1.98% swapper 6524 Xephyr 1.98% swapper 27478 netctl-auto 1.78% transmission-gt 0 swapper/3 1.53% Xephyr 0 swapper/0 1.29% netctl-auto 0 swapper/1 1.29% swapper 27476 netctl-auto 1.21% netctl-auto 0 swapper/3 1.17% swapper 233 irq/33-iwlwifi Note that pid 0 exists for each cpu so it has a comm of 'swapper/N'.
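The parsing step behind these keys is simple: the token is split at the first '.' into an event part and a field part, then matched against the session's events and the event's format fields (see add_dynamic_entry() in the diff below). A self-contained sketch of just the split:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *str = strdup("sched:sched_switch.next_pid");
	char *event_name = str;
	char *field_name = strchr(str, '.');

	if (field_name == NULL)
		return 1;		/* not an <event>.<field> token */
	*field_name++ = '\0';		/* split in place, as the patch does */

	printf("event: %s  field: %s\n", event_name, field_name);
	free(str);
	return 0;
}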
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 0c038a2..cc659ba 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1531,6 +1531,216 @@ static int __sort_dimension__add_hpp_output(struct sort_dimension *sd) return 0; } +struct hpp_dynamic_entry { + struct perf_hpp_fmt hpp; + struct perf_evsel *evsel; + struct format_field *field; + unsigned dynamic_len; +}; + +static int hde_width(struct hpp_dynamic_entry *hde) +{ + if (!hde->hpp.len) { + int len = hde->dynamic_len; + int namelen = strlen(hde->field->name); + int fieldlen = hde->field->size; + + if (namelen > len) + len = namelen; + + if (!(hde->field->flags & FIELD_IS_STRING)) { + /* length for print hex numbers */ + fieldlen = hde->field->size * 2 + 2; + } + if (fieldlen > len) + len = fieldlen; + + hde->hpp.len = len; + } + return hde->hpp.len; +} + +static int __sort__hde_header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, + struct perf_evsel *evsel __maybe_unused) +{ + struct hpp_dynamic_entry *hde; + size_t len = fmt->user_len; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + + if (!len) + len = hde_width(hde); + + return scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, hde->field->name); +} + +static int __sort__hde_width(struct perf_hpp_fmt *fmt, + struct perf_hpp *hpp __maybe_unused, + struct perf_evsel *evsel __maybe_unused) +{ + struct hpp_dynamic_entry *hde; + size_t len = fmt->user_len; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + + if (!len) + len = hde_width(hde); + + return len; +} + +static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, + struct hist_entry *he) +{ + struct hpp_dynamic_entry *hde; + size_t len = fmt->user_len; + struct trace_seq seq; + int ret; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + + if (!len) + len = hde_width(hde); + + if (hists_to_evsel(he->hists) != hde->evsel) + return scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, "N/A"); + + trace_seq_init(&seq); + pevent_print_field(&seq, he->raw_data, hde->field); + ret = scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, seq.buffer); + trace_seq_destroy(&seq); + return ret; +} + +static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, + struct hist_entry *a, struct hist_entry *b) +{ + struct hpp_dynamic_entry *hde; + struct format_field *field; + unsigned offset, size; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + + if (hists_to_evsel(a->hists) != hde->evsel) + return 0; + + field = hde->field; + if (field->flags & FIELD_IS_DYNAMIC) { + unsigned long long dyn; + + pevent_read_number_field(field, a->raw_data, &dyn); + offset = dyn & 0xffff; + size = (dyn >> 16) & 0xffff; + + /* record max width for output */ + if (size > hde->dynamic_len) + hde->dynamic_len = size; + } else { + offset = field->offset; + size = field->size; + } + + return memcmp(a->raw_data + offset, b->raw_data + offset, size); +} + +static struct hpp_dynamic_entry * +__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) +{ + struct hpp_dynamic_entry *hde; + + hde = malloc(sizeof(*hde)); + if (hde == NULL) { + 
pr_debug("Memory allocation failed\n"); + return NULL; + } + + hde->evsel = evsel; + hde->field = field; + hde->dynamic_len = 0; + + hde->hpp.name = field->name; + hde->hpp.header = __sort__hde_header; + hde->hpp.width = __sort__hde_width; + hde->hpp.entry = __sort__hde_entry; + hde->hpp.color = NULL; + + hde->hpp.cmp = __sort__hde_cmp; + hde->hpp.collapse = __sort__hde_cmp; + hde->hpp.sort = __sort__hde_cmp; + + INIT_LIST_HEAD(&hde->hpp.list); + INIT_LIST_HEAD(&hde->hpp.sort_list); + hde->hpp.elide = false; + hde->hpp.len = 0; + hde->hpp.user_len = 0; + + return hde; +} + +static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) +{ + char *str, *event_name, *field_name; + struct perf_evsel *evsel, *pos; + struct format_field *field; + struct hpp_dynamic_entry *hde; + int ret = 0; + + if (evlist == NULL) + return -ENOENT; + + str = strdup(tok); + if (str == NULL) + return -ENOMEM; + + event_name = str; + field_name = strchr(str, '.'); + if (field_name == NULL) { + ret = -EINVAL; + goto out; + } + *field_name++ = '\0'; + + evsel = NULL; + evlist__for_each(evlist, pos) { + if (!strcmp(pos->name, event_name)) { + evsel = pos; + break; + } + } + + if (evsel == NULL) { + pr_debug("Cannot find event: %s\n", event_name); + ret = -ENOENT; + goto out; + } + + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { + pr_debug("%s is not a tracepoint event\n", event_name); + ret = -EINVAL; + goto out; + } + + field = pevent_find_any_field(evsel->tp_format, field_name); + if (field == NULL) { + pr_debug("Cannot find event field for %s.%s\n", + event_name, field_name); + ret = -ENOENT; + goto out; + } + + hde = __alloc_dynamic_entry(evsel, field); + if (hde == NULL) { + ret = -ENOMEM; + goto out; + } + + perf_hpp__register_sort_field(&hde->hpp); + +out: + free(str); + return ret; +} + static int __sort_dimension__add(struct sort_dimension *sd) { if (sd->taken) @@ -1667,6 +1877,9 @@ static int sort_dimension__add(const char *tok, return 0; } + if (!add_dynamic_entry(evlist, tok)) + return 0; + return -ESRCH; } -- cgit v1.1 From 60517d28fbd91629686dcf9a39aef4e068a3d5f6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:03 +0900 Subject: perf tools: Try to show pretty printed output for dynamic sort keys Each tracepoint event has format string for print to improve readability. Try to parse the output and match the field name. If it finds one, use that for the result. If not, fallbacks to the original output. For example, sort on kmem:kmalloc.gfp_flags looks like below: (Note: libtraceevent plugins are not installed on my system. They might affect the output below) Before: # Overhead Command gfp_flags # ........ ....... .......... # 99.89% perf 32848 0.06% sleep 208 0.03% perf 32976 0.01% perf 208 After: # Overhead Command gfp_flags # ........ ....... ................... 
# 99.89% perf GFP_NOFS|GFP_ZERO 0.06% sleep GFP_KERNEL 0.03% perf GFP_KERNEL|GFP_ZERO 0.01% perf GFP_KERNEL Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-7-git-send-email-namhyung@kernel.org [ Fixed clash with earlier, updated patch in this patchkit ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 1 + tools/perf/util/sort.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++--- tools/perf/util/sort.h | 1 + 3 files changed, 102 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 2dcf38a..fdb97e1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -997,6 +997,7 @@ void hist_entry__delete(struct hist_entry *he) if (he->srcfile && he->srcfile[0]) free(he->srcfile); free_callchain(he->callchain); + free(he->trace_output); free(he->raw_data); free(he); } diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index cc659ba..22d28c7 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1560,6 +1560,62 @@ static int hde_width(struct hpp_dynamic_entry *hde) return hde->hpp.len; } +static char *get_trace_output(struct hist_entry *he) +{ + struct trace_seq seq; + struct perf_evsel *evsel; + struct pevent_record rec = { + .data = he->raw_data, + .size = he->raw_size, + }; + + evsel = hists_to_evsel(he->hists); + + trace_seq_init(&seq); + pevent_event_info(&seq, evsel->tp_format, &rec); + return seq.buffer; +} + +static void update_dynamic_len(struct hpp_dynamic_entry *hde, + struct hist_entry *he) +{ + char *str, *pos; + struct format_field *field = hde->field; + size_t namelen; + bool last = false; + + /* parse pretty print result and update max length */ + if (!he->trace_output) + he->trace_output = get_trace_output(he); + + namelen = strlen(field->name); + str = he->trace_output; + + while (str) { + pos = strchr(str, ' '); + if (pos == NULL) { + last = true; + pos = str + strlen(str); + } + + if (!strncmp(str, field->name, namelen)) { + size_t len; + + str += namelen + 1; + len = pos - str; + + if (len > hde->dynamic_len) + hde->dynamic_len = len; + break; + } + + if (last) + str = NULL; + else + str = pos + 1; + } +} + static int __sort__hde_header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct perf_evsel *evsel __maybe_unused) { @@ -1594,7 +1650,10 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, { struct hpp_dynamic_entry *hde; size_t len = fmt->user_len; - struct trace_seq seq; + char *str, *pos; + struct format_field *field; + size_t namelen; + bool last = false; int ret; hde = container_of(fmt, struct hpp_dynamic_entry, hpp); @@ -1605,10 +1664,43 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (hists_to_evsel(he->hists) != hde->evsel) return scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, "N/A"); - trace_seq_init(&seq); - pevent_print_field(&seq, he->raw_data, hde->field); - ret = scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, seq.buffer); - trace_seq_destroy(&seq); + field = hde->field; + + namelen = strlen(field->name); + str = he->trace_output; + + while (str) { + pos = strchr(str, ' '); + if (pos == NULL) { + last = true; + pos = str + strlen(str); + } + + if (!strncmp(str, field->name, namelen)) { + str += namelen + 1; + str = strndup(str, pos - str); + + if (str == NULL) + return scnprintf(hpp->buf, hpp->size, + 
"%*.*s", len, len, "ERROR"); + break; + } + + if (last) + str = NULL; + else + str = pos + 1; + } + + if (str == NULL) { + struct trace_seq seq; + trace_seq_init(&seq); + pevent_print_field(&seq, he->raw_data, hde->field); + str = seq.buffer; + } + + ret = scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, str); + free(str); return ret; } @@ -1638,6 +1730,9 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, } else { offset = field->offset; size = field->size; + + update_dynamic_len(hde, a); + update_dynamic_len(hde, b); } return memcmp(a->raw_data + offset, b->raw_data + offset, size); diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 1a00f1e..f6d2a7e 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -124,6 +124,7 @@ struct hist_entry { struct mem_info *mem_info; void *raw_data; u32 raw_size; + void *trace_output; struct callchain_root callchain[0]; /* must be last member */ }; -- cgit v1.1 From a34bb6a08d6020bde0475bc901793771858a1112 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:04 +0900 Subject: perf tools: Add 'trace' sort key The 'trace' sort key is to show tracepoint event output using either print fmt or plugin. For example sched_switch event (using plugin) will show output like below: # perf record -e sched:sched_switch -a usleep 10 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.197 MB perf.data (69 samples) ] # $ perf report -s trace --stdio ... # Overhead Trace output # ........ ................................................... # 9.48% swapper/0:0 [120] R ==> transmission-gt:17773 [120] 9.48% transmission-gt:17773 [120] S ==> swapper/0:0 [120] 9.04% swapper/2:0 [120] R ==> transmission-gt:17773 [120] 8.92% transmission-gt:17773 [120] S ==> swapper/2:0 [120] 5.25% swapper/0:0 [120] R ==> kworker/0:1H:109 [100] 5.21% kworker/0:1H:109 [100] S ==> swapper/0:0 [120] 1.78% swapper/3:0 [120] R ==> transmission-gt:17773 [120] 1.78% transmission-gt:17773 [120] S ==> swapper/3:0 [120] 1.53% Xephyr:6524 [120] S ==> swapper/0:0 [120] 1.53% swapper/0:0 [120] R ==> Xephyr:6524 [120] 1.17% swapper/2:0 [120] R ==> irq/33-iwlwifi:233 [49] 1.13% irq/33-iwlwifi:233 [49] S ==> swapper/2:0 [120] Note that the 'trace' sort key works only for tracepoint events. If it's used to other type of events, just "N/A" will be printed. 
Suggested-and-acked-by: Jiri Olsa Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.h | 1 + tools/perf/util/sort.c | 76 +++++++++++++++++++++++++++++++++++++++----------- tools/perf/util/sort.h | 1 + 3 files changed, 62 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 36439bf..15b22c5 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -52,6 +52,7 @@ enum hist_column { HISTC_MEM_IADDR_SYMBOL, HISTC_TRANSACTION, HISTC_CYCLES, + HISTC_TRACE, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 22d28c7..db8476a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -445,6 +445,65 @@ struct sort_entry sort_socket = { .se_width_idx = HISTC_SOCKET, }; +/* --sort trace */ + +static char *get_trace_output(struct hist_entry *he) +{ + struct trace_seq seq; + struct perf_evsel *evsel; + struct pevent_record rec = { + .data = he->raw_data, + .size = he->raw_size, + }; + + evsel = hists_to_evsel(he->hists); + + trace_seq_init(&seq); + pevent_event_info(&seq, evsel->tp_format, &rec); + return seq.buffer; +} + +static int64_t +sort__trace_cmp(struct hist_entry *left, struct hist_entry *right) +{ + struct perf_evsel *evsel; + + evsel = hists_to_evsel(left->hists); + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) + return 0; + + if (left->trace_output == NULL) + left->trace_output = get_trace_output(left); + if (right->trace_output == NULL) + right->trace_output = get_trace_output(right); + + hists__new_col_len(left->hists, HISTC_TRACE, strlen(left->trace_output)); + hists__new_col_len(right->hists, HISTC_TRACE, strlen(right->trace_output)); + + return strcmp(right->trace_output, left->trace_output); +} + +static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + struct perf_evsel *evsel; + + evsel = hists_to_evsel(he->hists); + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) + return scnprintf(bf, size, "%-*.*s", width, width, "N/A"); + + if (he->trace_output == NULL) + he->trace_output = get_trace_output(he); + return repsep_snprintf(bf, size, "%-*.*s", width, width, he->trace_output); +} + +struct sort_entry sort_trace = { + .se_header = "Trace output", + .se_cmp = sort__trace_cmp, + .se_snprintf = hist_entry__trace_snprintf, + .se_width_idx = HISTC_TRACE, +}; + /* sort keys for branch stacks */ static int64_t @@ -1314,6 +1373,7 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_LOCAL_WEIGHT, "local_weight", sort_local_weight), DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight), DIM(SORT_TRANSACTION, "transaction", sort_transaction), + DIM(SORT_TRACE, "trace", sort_trace), }; #undef DIM @@ -1560,22 +1620,6 @@ static int hde_width(struct hpp_dynamic_entry *hde) return hde->hpp.len; } -static char *get_trace_output(struct hist_entry *he) -{ - struct trace_seq seq; - struct perf_evsel *evsel; - struct pevent_record rec = { - .data = he->raw_data, - .size = he->raw_size, - }; - - evsel = hists_to_evsel(he->hists); - - trace_seq_init(&seq); - pevent_event_info(&seq, evsel->tp_format, &rec); - return seq.buffer; -} - static void update_dynamic_len(struct hpp_dynamic_entry *hde, struct hist_entry *he) { diff --git 
a/tools/perf/util/sort.h b/tools/perf/util/sort.h index f6d2a7e..6b7590a 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -183,6 +183,7 @@ enum sort_type { SORT_LOCAL_WEIGHT, SORT_GLOBAL_WEIGHT, SORT_TRANSACTION, + SORT_TRACE, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, -- cgit v1.1 From 053a3989e12fdf3be45c00ec1cb0ce09fba0ee4a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:05 +0900 Subject: perf report/top: Add --raw-trace option The --raw-trace option allows disabling pretty printing by the event's print_fmt or plugin. Besides that, each dynamic sort key now can receive a 'raw' suffix separated by '/' to ask for the raw trace of a specific field. $ perf report -s comm,kmem:kmalloc.gfp_flags ... # Overhead Command gfp_flags # ........ ....... ................... # 99.89% perf GFP_NOFS|GFP_ZERO 0.06% sleep GFP_KERNEL 0.03% perf GFP_KERNEL|GFP_ZERO 0.01% perf GFP_KERNEL Now $ perf report -s comm,kmem:kmalloc.gfp_flags --raw-trace or $ perf report -s comm,kmem:kmalloc.gfp_flags/raw ... # Overhead Command gfp_flags # ........ ....... .......... # 99.89% perf 32848 0.06% sleep 208 0.03% perf 32976 0.01% perf 208 Suggested-and-Acked-by: Jiri Olsa Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-9-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 3 +++ tools/perf/Documentation/perf-top.txt | 3 +++ tools/perf/builtin-report.c | 2 ++ tools/perf/builtin-top.c | 2 ++ tools/perf/util/sort.c | 32 +++++++++++++++++++++++++++++--- tools/perf/util/symbol.h | 3 ++- 6 files changed, 41 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index dab99ed..ae7cd91 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -371,6 +371,9 @@ include::itrace.txt[] --socket-filter:: Only report the samples on the processor socket that match with this filter +--raw-trace:: + When displaying traceevent output, do not use print fmt or plugins. + include::callchain-overhead-calculation.txt[] SEE ALSO diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 556cec0..b0e60e1 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -230,6 +230,9 @@ Default is to monitor all CPUS. The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k Note that this feature may not be available on all processors. +--raw-trace:: + When displaying traceevent output, do not use print fmt or plugins. 
+ INTERACTIVE PROMPTING KEYS -------------------------- diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index ea53c81..f10c663 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -788,6 +788,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "Show callgraph from reference event"), OPT_INTEGER(0, "socket-filter", &report.socket_filter, "only show processor socket that match with this filter"), + OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace, + "Show raw trace event output (do not use print fmt or plugins)"), OPT_END() }; struct perf_data_file file = { diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 0058259..bf01cbb 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1210,6 +1210,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) OPT_CALLBACK('j', "branch-filter", &opts->branch_stack, "branch filter mask", "branch stack filter modes", parse_branch_stack), + OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace, + "Show raw trace event output (do not use print fmt or plugins)"), OPT_END() }; const char * const top_usage[] = { diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index db8476a..3477685 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -459,7 +459,12 @@ static char *get_trace_output(struct hist_entry *he) evsel = hists_to_evsel(he->hists); trace_seq_init(&seq); - pevent_event_info(&seq, evsel->tp_format, &rec); + if (symbol_conf.raw_trace) { + pevent_print_fields(&seq, he->raw_data, he->raw_size, + evsel->tp_format); + } else { + pevent_event_info(&seq, evsel->tp_format, &rec); + } return seq.buffer; } @@ -1596,6 +1601,7 @@ struct hpp_dynamic_entry { struct perf_evsel *evsel; struct format_field *field; unsigned dynamic_len; + bool raw_trace; }; static int hde_width(struct hpp_dynamic_entry *hde) @@ -1628,6 +1634,9 @@ static void update_dynamic_len(struct hpp_dynamic_entry *hde, size_t namelen; bool last = false; + if (hde->raw_trace) + return; + /* parse pretty print result and update max length */ if (!he->trace_output) he->trace_output = get_trace_output(he); @@ -1708,8 +1717,10 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (hists_to_evsel(he->hists) != hde->evsel) return scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, "N/A"); - field = hde->field; + if (hde->raw_trace) + goto raw_field; + field = hde->field; namelen = strlen(field->name); str = he->trace_output; @@ -1738,6 +1749,7 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (str == NULL) { struct trace_seq seq; +raw_field: trace_seq_init(&seq); pevent_print_field(&seq, he->raw_data, hde->field); str = seq.buffer; @@ -1818,10 +1830,11 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) { - char *str, *event_name, *field_name; + char *str, *event_name, *field_name, *raw_opt; struct perf_evsel *evsel, *pos; struct format_field *field; struct hpp_dynamic_entry *hde; + bool raw_trace = symbol_conf.raw_trace; int ret = 0; if (evlist == NULL) @@ -1839,6 +1852,18 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) } *field_name++ = '\0'; + raw_opt = strchr(field_name, '/'); + if (raw_opt) { + *raw_opt++ = '\0'; + + if (strcmp(raw_opt, "raw")) { + pr_err("Unsupported field option %s\n", raw_opt); + ret = -EINVAL; + goto out; + } + raw_trace = true; + } 
+ evsel = NULL; evlist__for_each(evlist, pos) { if (!strcmp(pos->name, event_name)) { @@ -1872,6 +1897,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) ret = -ENOMEM; goto out; } + hde->raw_trace = raw_trace; perf_hpp__register_sort_field(&hde->hpp); diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 857f707..ccd1caa 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -109,7 +109,8 @@ struct symbol_conf { branch_callstack, has_filter, show_ref_callgraph, - hide_unresolved; + hide_unresolved, + raw_trace; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- cgit v1.1 From 5d0cff93bb7aa85349230d4e29902b2648640c53 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:06 +0900 Subject: perf tools: Support shortcuts for events in dynamic sort keys The dynamic sort key requires event name but specifying full event name is rather inconvenient. This patch adds more ways to identify the event in a more compact way. 1. If session has just one event, event name can be omitted. 2. Events can be accessed by index preceded by a percent sign. 3. A part of the name can be used, if it's not ambiguous. The partial name should not contain ':' in it. 4. Full system + event name is still used, it should contain ':'. So in the below example all does same thing: $ perf record -e sched:sched_switch -a sleep 1 $ perf report -s next_pid,next_comm $ perf report -s %1.next_pid,%1.next_comm $ perf report -s switch.next_pid,switch.next_comm $ perf report -s sched:sched_switch.next_pid,sched:sched_switch.next_comm Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-10-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 107 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 20 deletions(-) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 3477685..f3a98c2 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1828,10 +1828,90 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) return hde; } +static int parse_field_name(char *str, char **event, char **field, char **opt) +{ + char *event_name, *field_name, *opt_name; + + event_name = str; + field_name = strchr(str, '.'); + + if (field_name) { + *field_name++ = '\0'; + } else { + event_name = NULL; + field_name = str; + } + + opt_name = strchr(field_name, '/'); + if (opt_name) + *opt_name++ = '\0'; + + *event = event_name; + *field = field_name; + *opt = opt_name; + + return 0; +} + +/* find match evsel using a given event name. The event name can be: + * 1. NULL - only valid for single event session + * 2. '%' + event index (e.g. '%1' for first event) + * 3. full event name (e.g. sched:sched_switch) + * 4. 
partial event name (should not contain ':') + */ +static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_name) +{ + struct perf_evsel *evsel = NULL; + struct perf_evsel *pos; + bool full_name; + + /* case 1 */ + if (event_name == NULL) { + if (evlist->nr_entries != 1) { + pr_debug("event name should be given\n"); + return NULL; + } + + return perf_evlist__first(evlist); + } + + /* case 2 */ + if (event_name[0] == '%') { + int nr = strtol(event_name+1, NULL, 0); + + if (nr > evlist->nr_entries) + return NULL; + + evsel = perf_evlist__first(evlist); + while (--nr > 0) + evsel = perf_evsel__next(evsel); + + return evsel; + } + + full_name = !!strchr(event_name, ':'); + evlist__for_each(evlist, pos) { + /* case 3 */ + if (full_name && !strcmp(pos->name, event_name)) + return pos; + /* case 4 */ + if (!full_name && strstr(pos->name, event_name)) { + if (evsel) { + pr_debug("'%s' event is ambiguous: it can be %s or %s\n", + event_name, evsel->name, pos->name); + return NULL; + } + evsel = pos; + } + } + + return evsel; +} + static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) { - char *str, *event_name, *field_name, *raw_opt; - struct perf_evsel *evsel, *pos; + char *str, *event_name, *field_name, *opt_name; + struct perf_evsel *evsel; struct format_field *field; struct hpp_dynamic_entry *hde; bool raw_trace = symbol_conf.raw_trace; @@ -1844,34 +1924,21 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) if (str == NULL) return -ENOMEM; - event_name = str; - field_name = strchr(str, '.'); - if (field_name == NULL) { + if (parse_field_name(str, &event_name, &field_name, &opt_name) < 0) { ret = -EINVAL; goto out; } - *field_name++ = '\0'; - raw_opt = strchr(field_name, '/'); - if (raw_opt) { - *raw_opt++ = '\0'; - - if (strcmp(raw_opt, "raw")) { - pr_err("Unsupported field option %s\n", raw_opt); + if (opt_name) { + if (strcmp(opt_name, "raw")) { + pr_debug("unsupported field option %s\n", opt_name); ret = -EINVAL; goto out; } raw_trace = true; } - evsel = NULL; - evlist__for_each(evlist, pos) { - if (!strcmp(pos->name, event_name)) { - evsel = pos; - break; - } - } - + evsel = find_evsel(evlist, event_name); if (evsel == NULL) { pr_debug("Cannot find event: %s\n", event_name); ret = -ENOENT; -- cgit v1.1 From 3b099bf5898ac1bf44d822f0bc15a7517e6fa117 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:07 +0900 Subject: perf tools: Support '.*' dynamic sort key Support '*' character for field name to add all (non-common) fields as sort keys easily. $ perf report -s 'switch.*' --stdio ... # Overhead prev_comm prev_pid prev_prio prev_state next_comm next_pid next_prio # ........ ........... ......... ......... .......... ............ ........ ......... # 3.82% swapper/0 0 120 0 netctl-auto 18711 120 3.75% netctl-auto 18711 120 1 swapper/0 0 120 2.24% swapper/1 0 120 0 netctl-auto 18709 120 2.24% netctl-auto 18709 120 1 swapper/1 0 120 1.80% swapper/2 0 120 0 rcu_preempt 7 120 1.80% swapper/2 0 120 0 netctl-auto 18711 120 1.80% rcu_preempt 7 120 1 swapper/2 0 120 1.80% netctl-auto 18711 120 1 swapper/2 0 120 ... 
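The '*' expansion works because libtraceevent keeps an event's (non-common) fields on a singly linked list; the patch below simply walks it and registers one dynamic entry per node. A stand-alone sketch of the walk, with simplified stand-ins for libtraceevent's format_field and for the registration call:

#include <stdio.h>

struct field {
	const char	*name;
	struct field	*next;
};

static int add_dimension(const struct field *f)
{
	printf("registered sort key: %s\n", f->name);
	return 0;
}

static int add_all_fields(const struct field *fields)
{
	const struct field *f;

	for (f = fields; f != NULL; f = f->next) {
		if (add_dimension(f) < 0)
			return -1;
	}
	return 0;
}

int main(void)
{
	struct field prio = { "next_prio", NULL };
	struct field pid  = { "next_pid", &prio };
	struct field comm = { "next_comm", &pid };

	return add_all_fields(&comm);
}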
Suggested-and-acked-by: Jiri Olsa Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-11-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f3a98c2..f6aef15 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1908,12 +1908,27 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam return evsel; } +static int __dynamic_dimension__add(struct perf_evsel *evsel, + struct format_field *field, + bool raw_trace) +{ + struct hpp_dynamic_entry *hde; + + hde = __alloc_dynamic_entry(evsel, field); + if (hde == NULL) + return -ENOMEM; + + hde->raw_trace = raw_trace; + + perf_hpp__register_sort_field(&hde->hpp); + return 0; +} + static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) { char *str, *event_name, *field_name, *opt_name; struct perf_evsel *evsel; struct format_field *field; - struct hpp_dynamic_entry *hde; bool raw_trace = symbol_conf.raw_trace; int ret = 0; @@ -1951,22 +1966,26 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) goto out; } - field = pevent_find_any_field(evsel->tp_format, field_name); - if (field == NULL) { - pr_debug("Cannot find event field for %s.%s\n", - event_name, field_name); - ret = -ENOENT; - goto out; - } + if (!strcmp(field_name, "*")) { + field = evsel->tp_format->format.fields; - hde = __alloc_dynamic_entry(evsel, field); - if (hde == NULL) { - ret = -ENOMEM; - goto out; - } - hde->raw_trace = raw_trace; + while (field) { + ret = __dynamic_dimension__add(evsel, field, raw_trace); + if (ret < 0) + goto out; - perf_hpp__register_sort_field(&hde->hpp); + field = field->next; + } + } else { + field = pevent_find_any_field(evsel->tp_format, field_name); + if (field == NULL) { + pr_debug("Cannot find event field for %s.%s\n", + event_name, field_name); + return -ENOENT; + } + + ret = __dynamic_dimension__add(evsel, field, raw_trace); + } out: free(str); -- cgit v1.1 From 361459f163fa1ec7ff4700ec876c3b7ff5f36cc6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:08 +0900 Subject: perf tools: Skip dynamic fields not defined for current event When there are multiple events, each dynamic sort key is defined just for one event. In this case other events will always show "N/A" for those fields. But they are meaningless and consume precious screen width. Let's skip those undefined dynamic fields. $ perf record -e kmem:kmalloc,kmem:kfree -a sleep 1 $ perf report -s 'comm,kmalloc.*' --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 20K of event 'kmem:kmalloc' # Event count (approx.): 20533 # # Overhead Command call_site ptr bytes_req bytes_alloc gfp_flags # ........ ....... .................. .................. ......... ........... ................... 
# 99.89% perf ffffffffa01d4396 0xffff8803ffb79720 96 96 GFP_NOFS|GFP_ZERO 0.06% sleep ffffffff8114e1cd 0xffff8803d228a000 4096 4096 GFP_KERNEL 0.03% perf ffffffff811d6ae6 0xffff8803f7678f00 240 256 GFP_KERNEL|GFP_ZERO 0.00% perf ffffffff812263c1 0xffff880406172380 128 128 GFP_KERNEL 0.00% perf ffffffff812264b9 0xffff8803ffac1600 504 512 GFP_KERNEL 0.00% perf ffffffff81226634 0xffff880401dc5280 28 32 GFP_KERNEL 0.00% sleep ffffffff81226da9 0xffff8803ffac3a00 392 512 GFP_KERNEL # Samples: 20K of event 'kmem:kfree' # Event count (approx.): 20597 # # Overhead Command # ........ .............. # 99.63% perf 0.14% sleep 0.11% irq/36-iwlwifi 0.11% kworker/u16:0 0.01% Xorg 0.00% firefox Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-12-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/browsers/hists.c | 7 ++++--- tools/perf/ui/gtk/hists.c | 4 ++-- tools/perf/ui/hist.c | 2 +- tools/perf/ui/stdio/hist.c | 6 +++--- tools/perf/util/hist.c | 2 +- tools/perf/util/hist.h | 14 ++++++++++++-- tools/perf/util/sort.c | 20 ++++++++++++++------ 7 files changed, 37 insertions(+), 18 deletions(-) diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index ec33196..901d481 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1041,7 +1041,8 @@ static int hist_browser__show_entry(struct hist_browser *browser, hist_browser__gotorc(browser, row, 0); perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt) || column++ < browser->b.horiz_scroll) + if (perf_hpp__should_skip(fmt, entry->hists) || + column++ < browser->b.horiz_scroll) continue; if (current_entry && browser->b.navkeypressed) { @@ -1144,7 +1145,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char * } perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt) || column++ < browser->b.horiz_scroll) + if (perf_hpp__should_skip(fmt, hists) || column++ < browser->b.horiz_scroll) continue; ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); @@ -1414,7 +1415,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, printed += fprintf(fp, "%c ", folded_sign); perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, he->hists)) continue; if (!first) { diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 4677172..0f8dcfd 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -318,7 +318,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, col_idx = 0; perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, hists)) continue; /* @@ -368,7 +368,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, col_idx = 0; perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, h->hists)) continue; if (fmt->color) diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 5029ba2..8263c0e 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -619,7 +619,7 @@ unsigned int hists__sort_list_width(struct hists *hists) struct perf_hpp dummy_hpp; perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, hists)) continue; if (first) diff --git 
a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 7ebc661..387110d 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -385,7 +385,7 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) return 0; perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, he->hists)) continue; /* @@ -464,7 +464,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); perf_hpp__for_each_format(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, hists)) continue; if (!first) @@ -490,7 +490,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, perf_hpp__for_each_format(fmt) { unsigned int i; - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, hists)) continue; if (!first) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index fdb97e1..afc9b8f 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1112,7 +1112,7 @@ static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b) int64_t cmp = 0; perf_hpp__for_each_sort_list(fmt) { - if (perf_hpp__should_skip(fmt)) + if (perf_hpp__should_skip(fmt, a->hists)) continue; cmp = fmt->sort(fmt, a, b); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 15b22c5..cb8f373 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -262,10 +262,20 @@ void perf_hpp__append_sort_keys(void); bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format); bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b); +bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format); +bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists); -static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format) +static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format, + struct hists *hists) { - return format->elide; + if (format->elide) + return true; + + if (perf_hpp__is_dynamic_entry(format) && + !perf_hpp__defined_dynamic_entry(format, hists)) + return true; + + return false; } void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f6aef15..fd56223 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1698,6 +1698,15 @@ static int __sort__hde_width(struct perf_hpp_fmt *fmt, return len; } +bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists) +{ + struct hpp_dynamic_entry *hde; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + + return hists_to_evsel(hists) == hde->evsel; +} + static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, struct hist_entry *he) { @@ -1714,9 +1723,6 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (!len) len = hde_width(hde); - if (hists_to_evsel(he->hists) != hde->evsel) - return scnprintf(hpp->buf, hpp->size, "%*.*s", len, len, "N/A"); - if (hde->raw_trace) goto raw_field; @@ -1769,9 +1775,6 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, hde = container_of(fmt, struct hpp_dynamic_entry, hpp); - if (hists_to_evsel(a->hists) != hde->evsel) - return 0; - field = hde->field; if (field->flags & FIELD_IS_DYNAMIC) { unsigned long long dyn; @@ -1794,6 +1797,11 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, return memcmp(a->raw_data + offset, b->raw_data + offset, size); } +bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt 
*fmt) +{ + return fmt->cmp == __sort__hde_cmp; +} + static struct hpp_dynamic_entry * __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) { -- cgit v1.1 From 2e422fd1e4b0a1c0ca11d360be2147c87911dd1a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:09 +0900 Subject: perf tools: Add 'trace_fields' dynamic sort key The 'trace_fields' sort key is similar as 'trace' sort key, but it shows each fields separately. Each event will get different columns as their fields. $ perf report -s trace_fields --stdio # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 20K of event 'kmem:kmalloc' # Event count (approx.): 20533 # # Overhead Command call_site ptr bytes_req bytes_alloc gfp_flags # ........ ....... .................. .................. ......... ........... ................... # 99.89% perf ffffffffa01d4396 0xffff8803ffb79720 96 96 GFP_NOFS|GFP_ZERO 0.06% sleep ffffffff8114e1cd 0xffff8803d228a000 4096 4096 GFP_KERNEL 0.03% perf ffffffff811d6ae6 0xffff8803f7678f00 240 256 GFP_KERNEL|GFP_ZERO 0.00% perf ffffffff812263c1 0xffff880406172380 128 128 GFP_KERNEL 0.00% perf ffffffff812264b9 0xffff8803ffac1600 504 512 GFP_KERNEL 0.00% perf ffffffff81226634 0xffff880401dc5280 28 32 GFP_KERNEL 0.00% sleep ffffffff81226da9 0xffff8803ffac3a00 392 512 GFP_KERNEL # Samples: 20K of event 'kmem:kfree' # Event count (approx.): 20597 # # Overhead call_site ptr # ........ .................. .................. # 99.58% ffffffffa01d85ad 0xffff8803ffb79720 0.07% ffffffff81443f5c 0xffff8803f7669400 0.02% ffffffff811d5753 0xffff8803f7678f00 0.01% ffffffff81443f5c 0xffff8803f766be00 0.01% ffffffff8114e359 0xffff8803d228a000 0.01% ffffffff81443f5c 0xffff8800d156dc00 0.01% ffffffff81443f5c 0xffff8803f7669400 0.01% ffffffff8114e359 0xffff8803d228a000 0.01% ffffffff8114e359 0xffff8803d228a000 0.01% ffffffff8114e359 0xffff8803d228a000 Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-13-git-send-email-namhyung@kernel.org [ Combined with "perf tools: Fix segfault when using -s trace_fields" ] Link: http://lkml.kernel.org/r/1451991518-25673-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index fd56223..79aa71d 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1932,6 +1932,38 @@ static int __dynamic_dimension__add(struct perf_evsel *evsel, return 0; } +static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace) +{ + int ret; + struct format_field *field; + + field = evsel->tp_format->format.fields; + while (field) { + ret = __dynamic_dimension__add(evsel, field, raw_trace); + if (ret < 0) + return ret; + + field = field->next; + } + return 0; +} + +static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace) +{ + int ret; + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) + continue; + + ret = add_evsel_fields(evsel, raw_trace); + if (ret < 0) + return ret; + } + return 0; +} + static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) { char *str, *event_name, *field_name, *opt_name; @@ 
-1961,6 +1993,11 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) raw_trace = true; } + if (!strcmp(field_name, "trace_fields")) { + ret = add_all_dynamic_fields(evlist, raw_trace); + goto out; + } + evsel = find_evsel(evlist, event_name); if (evsel == NULL) { pr_debug("Cannot find event: %s\n", event_name); @@ -1975,15 +2012,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) } if (!strcmp(field_name, "*")) { - field = evsel->tp_format->format.fields; - - while (field) { - ret = __dynamic_dimension__add(evsel, field, raw_trace); - if (ret < 0) - goto out; - - field = field->next; - } + ret = add_evsel_fields(evsel, raw_trace); } else { field = pevent_find_any_field(evsel->tp_format, field_name); if (field == NULL) { -- cgit v1.1 From d49dadea78624353d1df660efb49f187bd5c5971 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 23 Dec 2015 02:07:10 +0900 Subject: perf tools: Make 'trace' or 'trace_fields' sort key default for tracepoint events When an evlist contains tracepoint events only, use the 'trace' sort key as default. If the --raw-trace option was given, use 'trace_fields' instead. This makes it more convenient for users to see the trace result. Suggested-and-Acked-by: Jiri Olsa Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1450804030-29193-14-git-send-email-namhyung@kernel.org [ Check evlist in get_default_sort_order() fixing a segfault in 'perf test hists' reported by Jiri Olsa ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 30 +++++++++++++++++++++++++----- tools/perf/util/sort.h | 1 + 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 79aa71d..4b4b1c5 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -15,6 +15,7 @@ const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cy const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked"; const char default_top_sort_order[] = "dso,symbol"; const char default_diff_sort_order[] = "dso,symbol"; +const char default_tracepoint_sort_order[] = "trace"; const char *sort_order; const char *field_order; regex_t ignore_callees_regex; @@ -2171,7 +2172,7 @@ static int sort_dimension__add(const char *tok, return -ESRCH; } -static const char *get_default_sort_order(void) +static const char *get_default_sort_order(struct perf_evlist *evlist) { const char *default_sort_orders[] = { default_sort_order, @@ -2179,14 +2180,33 @@ default_mem_sort_order, default_top_sort_order, default_diff_sort_order, + default_tracepoint_sort_order, }; + bool use_trace = true; + struct perf_evsel *evsel; BUG_ON(sort__mode >= ARRAY_SIZE(default_sort_orders)); + if (evlist == NULL) + goto out_no_evlist; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { + use_trace = false; + break; + } + } + + if (use_trace) { + sort__mode = SORT_MODE__TRACEPOINT; + if (symbol_conf.raw_trace) + return "trace_fields"; + } +out_no_evlist: return default_sort_orders[sort__mode]; } -static int setup_sort_order(void) +static int setup_sort_order(struct perf_evlist *evlist) { char *new_sort_order; @@ -2207,7 +2227,7 @@ static int setup_sort_order(void) * because it's checked over the rest of the code.
*/ if (asprintf(&new_sort_order, "%s,%s", - get_default_sort_order(), sort_order + 1) < 0) { + get_default_sort_order(evlist), sort_order + 1) < 0) { error("Not enough memory to set up --sort"); return -ENOMEM; } @@ -2222,7 +2242,7 @@ static int __setup_sorting(struct perf_evlist *evlist) const char *sort_keys; int ret = 0; - ret = setup_sort_order(); + ret = setup_sort_order(evlist); if (ret) return ret; @@ -2236,7 +2256,7 @@ static int __setup_sorting(struct perf_evlist *evlist) return 0; } - sort_keys = get_default_sort_order(); + sort_keys = get_default_sort_order(evlist); } str = strdup(sort_keys); diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 6b7590a..dec536b 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -167,6 +167,7 @@ enum sort_mode { SORT_MODE__MEMORY, SORT_MODE__TOP, SORT_MODE__DIFF, + SORT_MODE__TRACEPOINT, }; enum sort_type { -- cgit v1.1 From d0018b495c0429af3efc1b54f16d291a9fa8b4be Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Dec 2015 18:58:30 +0100 Subject: tools build feature: Fix feature_check_display_code typo This function is cursed.. ;-) Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama Link: http://lkml.kernel.org/r/1450893514-9158-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 6c0519d..a8b4bef 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -143,7 +143,7 @@ ifneq ("$(FEATURE_DUMP)","$(FEATURE_DUMP_FILE)") endif feature_display_check = $(eval $(feature_check_display_code)) -define feature_display_check_code +define feature_check_display_code ifneq ($(feature-$(1)), 1) feature_display := 1 endif -- cgit v1.1 From 76ee2ff342743e57cb6dc059d0aba90e0c4c9bfc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Dec 2015 18:58:31 +0100 Subject: tools build feature: Move dwarf post unwind choice output into perf We decide what dwarf unwind to choose way after the Makefile.feature makefile is included. The $(dwarf-post-unwind) is not even set at that time. For the same reason it was never included in FEATURE-DUMP file. Moving it into perf VF=1 verbose display. $ make VF=1 BUILD: Doing 'make -j4' parallel build Auto-detecting system features: ... dwarf: [ on ] ... ... LIBUNWIND_DIR: ... LIBDW_DIR: ... DWARF post unwind library: libunwind ... Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama Link: http://lkml.kernel.org/r/1450893514-9158-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 9 --------- tools/perf/config/Makefile | 4 ++++ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index a8b4bef..2075312 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -126,10 +126,6 @@ FEATURE_DUMP_FILENAME = $(OUTPUT)FEATURE-DUMP$(FEATURE_USER) FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),feature-$(feat)($(feature-$(feat)))) FEATURE_DUMP_FILE := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_FILENAME)) -ifeq ($(dwarf-post-unwind),1) - FEATURE_DUMP += dwarf-post-unwind($(dwarf-post-unwind-text)) -endif - # The $(feature_display) controls the default detection message # output. 
It's set if: # - detected features differs from stored features from @@ -160,11 +156,6 @@ ifeq ($(feature_display),1) $(info ) $(info Auto-detecting system features:) $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),)) - - ifeq ($(dwarf-post-unwind),1) - $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) - endif - ifneq ($(feature_verbose),1) $(info ) endif diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index a552417..18b2f96 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -751,6 +751,10 @@ ifeq ($(VF),1) $(call print_var,sysconfdir) $(call print_var,LIBUNWIND_DIR) $(call print_var,LIBDW_DIR) + + ifeq ($(dwarf-post-unwind),1) + $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) + endif $(info ) endif -- cgit v1.1 From c6a5f88f335ec43d2850d62bc4924f719d265670 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Dec 2015 18:58:32 +0100 Subject: tools build feature: Introduce feature_assign macro The feature_assign macro generates a feature value assignment for a name, like: $(call feature_assign,dwarf) == feature-dwarf=1 This will be used more in following patches. Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama Link: http://lkml.kernel.org/r/1450893514-9158-4-git-send-email-jolsa@kernel.org [ Rename it to feature_assign, the original shorter name was misleading, to say the least ;-) ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 2075312..b1b262e 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -122,8 +122,14 @@ define feature_print_text_code MSG = $(shell printf '...%30s: %s' $(1) $(2)) endef +# +# generates feature value assignment for name, like: +# $(call feature_assign,dwarf) == feature-dwarf=1 +# +feature_assign = feature-$(1)=$(feature-$(1)) + FEATURE_DUMP_FILENAME = $(OUTPUT)FEATURE-DUMP$(FEATURE_USER) -FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),feature-$(feat)($(feature-$(feat)))) +FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),$(call feature_assign,$(feat))) FEATURE_DUMP_FILE := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_FILENAME)) # The $(feature_display) controls the default detection message # output. It's set if: -- cgit v1.1 From 936d120d5f6406377e622da3167cafc811d053ea Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Dec 2015 18:58:33 +0100 Subject: tools build feature: Use value assignment form for FEATURE-DUMP file Changing the contents of the FEATURE-DUMP file, so it looks like: feature-backtrace=1 feature-dwarf=0 feature-fortify-source=1 feature-sync-compare-and-swap=0 This way it can be included in sub-projects, so they won't be forced to redo feature detection. Also now storing the complete set of features.
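As an aside, the key=value form is also trivially machine-readable outside of make (the sub-projects themselves consume it via 'include'). A rough Python sketch, illustration only and not code from this series, that loads such a file:

    # Illustration only: parse a FEATURE-DUMP file in the new
    # "feature-<name>=<0|1>" format described above.
    def load_feature_dump(path):
        features = {}
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or '=' not in line:
                    continue
                name, value = line.split('=', 1)
                features[name] = (value == '1')
        return features

    # e.g.: load_feature_dump('FEATURE-DUMP').get('feature-dwarf')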
Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama Link: http://lkml.kernel.org/r/1450893514-9158-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile.feature | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index b1b262e..02db3cd 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -129,8 +129,24 @@ endef feature_assign = feature-$(1)=$(feature-$(1)) FEATURE_DUMP_FILENAME = $(OUTPUT)FEATURE-DUMP$(FEATURE_USER) -FEATURE_DUMP := $(foreach feat,$(FEATURE_DISPLAY),$(call feature_assign,$(feat))) -FEATURE_DUMP_FILE := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_FILENAME)) +FEATURE_DUMP := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_FILENAME)) + +feature_dump_check = $(eval $(feature_dump_check_code)) +define feature_dump_check_code + ifeq ($(findstring $(1),$(FEATURE_DUMP)),) + $(2) := 1 + endif +endef + +# +# First check if any test from FEATURE_DISPLAY +# and set feature_display := 1 if it does +$(foreach feat,$(FEATURE_DISPLAY),$(call feature_dump_check,$(call feature_assign,$(feat)),feature_display)) + +# +# Now also check if any other test changed, +# so we force FEATURE-DUMP generation +$(foreach feat,$(FEATURE_TESTS),$(call feature_dump_check,$(call feature_assign,$(feat)),feature_dump_changed)) # The $(feature_display) controls the default detection message # output. It's set if: @@ -139,9 +155,9 @@ FEATURE_DUMP_FILE := $(shell touch $(FEATURE_DUMP_FILENAME); cat $(FEATURE_DUMP_ # - one of the $(FEATURE_DISPLAY) is not detected # - VF is enabled -ifneq ("$(FEATURE_DUMP)","$(FEATURE_DUMP_FILE)") - $(shell echo "$(FEATURE_DUMP)" > $(FEATURE_DUMP_FILENAME)) - feature_display := 1 +ifeq ($(feature_dump_changed),1) + $(shell rm -f $(FEATURE_DUMP_FILENAME)) + $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME))) endif feature_display_check = $(eval $(feature_check_display_code)) -- cgit v1.1 From 58683600dfe377c883eb8217b5a9bfcfe231b3ff Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Dec 2015 18:58:34 +0100 Subject: perf build: Use FEATURE-DUMP in bpf subproject Using FEATURE-DUMP in bpf subproject for features detection in case bpf is built via perf. Keeping the current features detection otherwise. 
Signed-off-by: Jiri Olsa Tested-by: Wang Nan Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama Link: http://lkml.kernel.org/r/1450893514-9158-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/bpf/Makefile | 4 ++++ tools/perf/Makefile.perf | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 636e3dd..919b717 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -80,7 +80,11 @@ endif endif ifeq ($(check_feat),1) +ifeq ($(FEATURES_DUMP),) include $(srctree)/tools/build/Makefile.feature +else +include $(FEATURES_DUMP) +endif endif export prefix libdir src obj diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 569fcf0..404e3b1 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -436,7 +436,7 @@ $(LIBAPI)-clean: $(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null $(LIBBPF): fixdep FORCE - $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a + $(Q)$(MAKE) -C $(BPF_DIR) O=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(realpath $(OUTPUT)FEATURE-DUMP) $(LIBBPF)-clean: $(call QUIET_CLEAN, libbpf) -- cgit v1.1 From 9735be24ec086fbccee321471cc21dedefa956a6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 5 Jan 2016 19:58:35 +0900 Subject: perf tools: Add all matching dynamic sort keys for field name When a perf.data file has multiple events, it's likely to be similar (tracepoint) events. In that case, they might have same field name so add all of them to sort keys instead of bailing out. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1451991518-25673-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.c | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 4b4b1c5..04e2a5c 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1863,10 +1863,9 @@ static int parse_field_name(char *str, char **event, char **field, char **opt) } /* find match evsel using a given event name. The event name can be: - * 1. NULL - only valid for single event session - * 2. '%' + event index (e.g. '%1' for first event) - * 3. full event name (e.g. sched:sched_switch) - * 4. partial event name (should not contain ':') + * 1. '%' + event index (e.g. '%1' for first event) + * 2. full event name (e.g. sched:sched_switch) + * 3. 
partial event name (should not contain ':') */ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_name) { @@ -1875,16 +1874,6 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam bool full_name; /* case 1 */ - if (event_name == NULL) { - if (evlist->nr_entries != 1) { - pr_debug("event name should be given\n"); - return NULL; - } - - return perf_evlist__first(evlist); - } - - /* case 2 */ if (event_name[0] == '%') { int nr = strtol(event_name+1, NULL, 0); @@ -1900,10 +1889,10 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam full_name = !!strchr(event_name, ':'); evlist__for_each(evlist, pos) { - /* case 3 */ + /* case 2 */ if (full_name && !strcmp(pos->name, event_name)) return pos; - /* case 4 */ + /* case 3 */ if (!full_name && strstr(pos->name, event_name)) { if (evsel) { pr_debug("'%s' event is ambiguous: it can be %s or %s\n", @@ -1965,6 +1954,28 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace) return 0; } +static int add_all_matching_fields(struct perf_evlist *evlist, + char *field_name, bool raw_trace) +{ + int ret = -ESRCH; + struct perf_evsel *evsel; + struct format_field *field; + + evlist__for_each(evlist, evsel) { + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) + continue; + + field = pevent_find_any_field(evsel->tp_format, field_name); + if (field == NULL) + continue; + + ret = __dynamic_dimension__add(evsel, field, raw_trace); + if (ret < 0) + break; + } + return ret; +} + static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) { char *str, *event_name, *field_name, *opt_name; @@ -1999,6 +2010,11 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) goto out; } + if (event_name == NULL) { + ret = add_all_matching_fields(evlist, field_name, raw_trace); + goto out; + } + evsel = find_evsel(evlist, event_name); if (evsel == NULL) { pr_debug("Cannot find event: %s\n", event_name); -- cgit v1.1 From 4c96bee03247c6eab27287fa66457a231b9fab79 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 5 Jan 2016 19:58:36 +0900 Subject: perf report: Add documentation for dynamic sort keys Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1451991518-25673-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-report.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index ae7cd91..8a301f6 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -117,6 +117,30 @@ OPTIONS And default sort keys are changed to comm, dso_from, symbol_from, dso_to and symbol_to, see '--branch-stack'. + If the data file has tracepoint event(s), the following (dynamic) sort keys + are also available: + trace, trace_fields, [<event>.]<field>[/raw] + + - trace: pretty printed trace output in a single column + - trace_fields: fields in tracepoints in separate columns + - [<event>.]<field>: optional event and field name for a specific field + + The last form consists of event and field names. If the event name is + omitted, it searches all events for a matching field name. The matched + field will be shown only for the event that has the field.
The event name + supports substring match so user doesn't need to specify full subsystem + and event name everytime. For example, 'sched:sched_switch' event can + be shortened to 'switch' as long as it's not ambiguous. Also event can + be specified by its index (starting from 1) preceded by the '%'. + So '%1' is the first event, '%2' is the second, and so on. + + The field name can have '/raw' suffix which disables pretty printing + and shows raw field value like hex numbers. The --raw-trace option + has the same effect for all dynamic sort keys. + + The default sort keys are changed to 'trace' if all events in the data + file are tracepoint. + -F:: --fields=:: Specify output field - multiple keys can be specified in CSV format. -- cgit v1.1 From 6db1a5c190d6abe416ea36aa28a6c53e0b3bbd5e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Jan 2016 22:09:05 +0100 Subject: perf stat record: Keep sample_type 0 for pipe session For pipe sessions we need to keep sample_type zero, because script's perf_evsel__check_attr is triggered by sample_type != 0, and the check would fail on stat session. I was tempted to keep it zero unconditionally, but the pipe session is sufficient. In perf.data session we are guarded by HEADER_STAT feature. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452028152-26762-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9805e03..7f56824 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -184,11 +184,18 @@ static int create_perf_stat_counter(struct perf_evsel *evsel) * like tracepoints. Clear it up for counting. */ attr->sample_period = 0; + /* * But set sample_type to PERF_SAMPLE_IDENTIFIER, which should be harmless * while avoiding that older tools show confusing messages. + * + * However for pipe sessions we need to keep it zero, + * because script's perf_evsel__check_attr is triggered + * by attr->sample_type != 0, and we can't run it on + * stat sessions. */ - attr->sample_type = PERF_SAMPLE_IDENTIFIER; + if (!(STAT_RECORD && perf_stat.file.is_pipe)) + attr->sample_type = PERF_SAMPLE_IDENTIFIER; /* * Disabling all counters initially, they will be enabled -- cgit v1.1 From cfc8874a485992491b865dde64965bb7c18c26b5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Jan 2016 22:09:06 +0100 Subject: perf script: Process cpu/threads maps Adding processing of cpu/threads maps. Configuring session's evlist with these maps. 
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452028152-26762-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 67 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index bcc3542..aa6d7cf 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -18,7 +18,11 @@ #include "util/sort.h" #include "util/data.h" #include "util/auxtrace.h" +#include "util/cpumap.h" +#include "util/thread_map.h" +#include "util/stat.h" #include +#include "asm/bug.h" static char const *script_name; static char const *generate_script_lang; @@ -606,6 +610,9 @@ struct perf_script { bool show_task_events; bool show_mmap_events; bool show_switch_events; + bool allocated; + struct cpu_map *cpus; + struct thread_map *threads; }; static void process_event(struct perf_script *script __maybe_unused, union perf_event *event, @@ -1682,6 +1689,63 @@ static void script__setup_sample_type(struct perf_script *script) } } +static int set_maps(struct perf_script *script) +{ + struct perf_evlist *evlist = script->session->evlist; + + if (!script->cpus || !script->threads) + return 0; + + if (WARN_ONCE(script->allocated, "stats double allocation\n")) + return -EINVAL; + + perf_evlist__set_maps(evlist, script->cpus, script->threads); + + if (perf_evlist__alloc_stats(evlist, true)) + return -ENOMEM; + + script->allocated = true; + return 0; +} + +static +int process_thread_map_event(struct perf_tool *tool, + union perf_event *event, + struct perf_session *session __maybe_unused) +{ + struct perf_script *script = container_of(tool, struct perf_script, tool); + + if (script->threads) { + pr_warning("Extra thread map event, ignoring.\n"); + return 0; + } + + script->threads = thread_map__new_event(&event->thread_map); + if (!script->threads) + return -ENOMEM; + + return set_maps(script); +} + +static +int process_cpu_map_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session __maybe_unused) +{ + struct perf_script *script = container_of(tool, struct perf_script, tool); + + if (script->cpus) { + pr_warning("Extra cpu map event, ignoring.\n"); + return 0; + } + + script->cpus = cpu_map__new_data(&event->cpu_map.data); + if (!script->cpus) + return -ENOMEM; + + return set_maps(script); +} + int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) { bool show_full_info = false; @@ -1710,6 +1774,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .auxtrace_info = perf_event__process_auxtrace_info, .auxtrace = perf_event__process_auxtrace, .auxtrace_error = perf_event__process_auxtrace_error, + .thread_map = process_thread_map_event, + .cpu_map = process_cpu_map_event, .ordered_events = true, .ordering_requires_timestamps = true, }, @@ -2063,6 +2129,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) flush_scripting(); out_delete: + perf_evlist__free_stats(session->evlist); perf_session__delete(session); if (script_started) -- cgit v1.1 From 91a2c3d54fd4b5be4e25acc1d8c1cc9a28319774 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Jan 2016 22:09:07 +0100 Subject: perf script: Process stat config event Adding processing of stat config event and initialize stat_config object. 
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452028152-26762-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index aa6d7cf..a90bc0b 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -36,6 +36,7 @@ static bool print_flags; static bool nanosecs; static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); +static struct perf_stat_config stat_config; unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH; @@ -1689,6 +1690,14 @@ static void script__setup_sample_type(struct perf_script *script) } } +static int process_stat_config_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session __maybe_unused) +{ + perf_event__read_stat_config(&stat_config, &event->stat_config); + return 0; +} + static int set_maps(struct perf_script *script) { struct perf_evlist *evlist = script->session->evlist; @@ -1774,6 +1783,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .auxtrace_info = perf_event__process_auxtrace_info, .auxtrace = perf_event__process_auxtrace, .auxtrace_error = perf_event__process_auxtrace_error, + .stat_config = process_stat_config_event, .thread_map = process_thread_map_event, .cpu_map = process_cpu_map_event, .ordered_events = true, -- cgit v1.1 From 8058a30ce174060a3c8156dc87b4d4ae39e8281b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Jan 2016 22:09:08 +0100 Subject: perf script: Add process_stat/process_stat_interval scripting interface Python and perl scripting code will define those callbacks and get stat data. Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452028152-26762-5-git-send-email-jolsa@kernel.org [ Rename 'time' parameters to 'tstamp', to fix the build in older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/trace-event.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index b85ee55..bce5b1d 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -65,6 +65,7 @@ int tracing_data_put(struct tracing_data *tdata); struct addr_location; struct perf_session; +struct perf_stat_config; struct scripting_ops { const char *name; @@ -75,6 +76,9 @@ struct scripting_ops { struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al); + void (*process_stat)(struct perf_stat_config *config, + struct perf_evsel *evsel, u64 tstamp); + void (*process_stat_interval)(u64 tstamp); int (*generate_script) (struct pevent *pevent, const char *outfile); }; -- cgit v1.1 From e099eba8c8df0f96e7cd6ddc5fc3151fe37be24e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Jan 2016 22:09:09 +0100 Subject: perf script: Add stat default handlers Implement struct scripting_ops::(process_stat|process_stat_interval) handlers - calling scripting handlers from stat events handlers. 
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452028152-26762-6-git-send-email-jolsa@kernel.org [ Rename 'time' parameters to 'tstamp', to fix the build in older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index a90bc0b..5e18654 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -221,6 +221,9 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, struct perf_event_attr *attr = &evsel->attr; bool allow_user_set; + if (perf_header__has_feat(&session->header, HEADER_STAT)) + return 0; + allow_user_set = perf_header__has_feat(&session->header, HEADER_AUXTRACE); @@ -674,6 +677,18 @@ static void process_event(struct perf_script *script __maybe_unused, union perf_ static struct scripting_ops *scripting_ops; +static void process_stat(struct perf_evsel *counter, u64 tstamp) +{ + if (scripting_ops && scripting_ops->process_stat) + scripting_ops->process_stat(&stat_config, counter, tstamp); +} + +static void process_stat_interval(u64 tstamp) +{ + if (scripting_ops && scripting_ops->process_stat_interval) + scripting_ops->process_stat_interval(tstamp); +} + static void setup_scripting(void) { setup_perl_scripting(); @@ -1690,6 +1705,22 @@ static void script__setup_sample_type(struct perf_script *script) } } +static int process_stat_round_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_session *session) +{ + struct stat_round_event *round = &event->stat_round; + struct perf_evsel *counter; + + evlist__for_each(session->evlist, counter) { + perf_stat_process_counter(&stat_config, counter); + process_stat(counter, round->time); + } + + process_stat_interval(round->time); + return 0; +} + static int process_stat_config_event(struct perf_tool *tool __maybe_unused, union perf_event *event, struct perf_session *session __maybe_unused) @@ -1783,6 +1814,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .auxtrace_info = perf_event__process_auxtrace_info, .auxtrace = perf_event__process_auxtrace, .auxtrace_error = perf_event__process_auxtrace_error, + .stat = perf_event__process_stat_event, + .stat_round = process_stat_round_event, .stat_config = process_stat_config_event, .thread_map = process_thread_map_event, .cpu_map = process_cpu_map_event, -- cgit v1.1 From aef90263561a87ae6d9c6a0f4071d825ce636eef Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 5 Jan 2016 22:09:11 +0100 Subject: perf script: Add python support for stat events Add support to get stat events data in perf python scripts. The python script shall implement the following new interface to process stat data: def stat__<event_name>_[<modifier>](cpu, thread, time, val, ena, run): - is called for every stat event for the given counter; if the user monitors 'cycles,instructions:u', the following callbacks should be defined: def stat__cycles(cpu, thread, time, val, ena, run): def stat__instructions_u(cpu, thread, time, val, ena, run): def stat__interval(time): - is called for every interval with its time; in non-interval mode it's called after the last stat event with the total measured time in ns The rest of the current interface stays untouched. Please check the example CPI metrics script in the following patch, with command line examples in its changelog.
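To make the interface concrete, here is a minimal sketch, illustration only and not the stat-cpi.py script added later in this series, of a script for a 'cycles,instructions:u' session, run via 'perf script -s'; note that cpu and thread are -1 for globally aggregated counts:

    # Minimal sketch of the stat scripting interface described above.
    data = {}

    def stat__cycles(cpu, thread, time, val, ena, run):
        data[(time, cpu, thread, "cycles")] = val

    def stat__instructions_u(cpu, thread, time, val, ena, run):
        data[(time, cpu, thread, "instructions:u")] = val

    def stat__interval(time):
        # Called per interval; without -I it fires once, with the total time.
        for (t, cpu, thread, name), val in sorted(data.items()):
            if t == time:
                print "%15f: cpu %d, thread %d, %s = %d" % \
                    (t / 1e9, cpu, thread, name, val)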
Signed-off-by: Jiri Olsa Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452028152-26762-8-git-send-email-jolsa@kernel.org [ Rename 'time' parameters to 'tstamp', to fix the build in older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- .../util/scripting-engines/trace-event-python.c | 115 +++++++++++++++++++-- 1 file changed, 109 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index a8e825f..d72fafc 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -41,6 +41,9 @@ #include "../thread-stack.h" #include "../trace-event.h" #include "../machine.h" +#include "thread_map.h" +#include "cpumap.h" +#include "stat.h" PyMODINIT_FUNC initperf_trace_context(void); @@ -859,6 +862,104 @@ static void python_process_event(union perf_event *event, } } +static void get_handler_name(char *str, size_t size, + struct perf_evsel *evsel) +{ + char *p = str; + + scnprintf(str, size, "stat__%s", perf_evsel__name(evsel)); + + while ((p = strchr(p, ':'))) { + *p = '_'; + p++; + } +} + +static void +process_stat(struct perf_evsel *counter, int cpu, int thread, u64 tstamp, + struct perf_counts_values *count) +{ + PyObject *handler, *t; + static char handler_name[256]; + int n = 0; + + t = PyTuple_New(MAX_FIELDS); + if (!t) + Py_FatalError("couldn't create Python tuple"); + + get_handler_name(handler_name, sizeof(handler_name), + counter); + + handler = get_handler(handler_name); + if (!handler) { + pr_debug("can't find python handler %s\n", handler_name); + return; + } + + PyTuple_SetItem(t, n++, PyInt_FromLong(cpu)); + PyTuple_SetItem(t, n++, PyInt_FromLong(thread)); + + tuple_set_u64(t, n++, tstamp); + tuple_set_u64(t, n++, count->val); + tuple_set_u64(t, n++, count->ena); + tuple_set_u64(t, n++, count->run); + + if (_PyTuple_Resize(&t, n) == -1) + Py_FatalError("error resizing Python tuple"); + + call_object(handler, t, handler_name); + + Py_DECREF(t); +} + +static void python_process_stat(struct perf_stat_config *config, + struct perf_evsel *counter, u64 tstamp) +{ + struct thread_map *threads = counter->threads; + struct cpu_map *cpus = counter->cpus; + int cpu, thread; + + if (config->aggr_mode == AGGR_GLOBAL) { + process_stat(counter, -1, -1, tstamp, + &counter->counts->aggr); + return; + } + + for (thread = 0; thread < threads->nr; thread++) { + for (cpu = 0; cpu < cpus->nr; cpu++) { + process_stat(counter, cpus->map[cpu], + thread_map__pid(threads, thread), tstamp, + perf_counts(counter->counts, cpu, thread)); + } + } +} + +static void python_process_stat_interval(u64 tstamp) +{ + PyObject *handler, *t; + static const char handler_name[] = "stat__interval"; + int n = 0; + + t = PyTuple_New(MAX_FIELDS); + if (!t) + Py_FatalError("couldn't create Python tuple"); + + handler = get_handler(handler_name); + if (!handler) { + pr_debug("can't find python handler %s\n", handler_name); + return; + } + + tuple_set_u64(t, n++, tstamp); + + if (_PyTuple_Resize(&t, n) == -1) + Py_FatalError("error resizing Python tuple"); + + call_object(handler, t, handler_name); + + Py_DECREF(t); +} + static int run_start_sub(void) { main_module = PyImport_AddModule("__main__"); @@ -1201,10 +1302,12 @@ static int python_generate_script(struct pevent *pevent, const char *outfile) } struct scripting_ops python_scripting_ops = { - .name = "Python", - .start_script = python_start_script, - 
.flush_script = python_flush_script, - .stop_script = python_stop_script, - .process_event = python_process_event, - .generate_script = python_generate_script, + .name = "Python", + .start_script = python_start_script, + .flush_script = python_flush_script, + .stop_script = python_stop_script, + .process_event = python_process_event, + .process_stat = python_process_stat, + .process_stat_interval = python_process_stat_interval, + .generate_script = python_generate_script, }; -- cgit v1.1 From 15d2b9956b41ffb5961b897bf61cdc09f722dfbf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Jan 2016 11:49:55 +0100 Subject: perf cpumap: Fix cpu conversion in cpu_map__from_entries We can't convert u16 cpu_map_entries::cpu[x] value directly to int, because it could hold -1, which would be converted as 65535. Adding special treatment for -1, which is not real cpu number, to be converted to (int -1). Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452077397-31958-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index a0717b9..fa93509 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -188,8 +188,17 @@ static struct cpu_map *cpu_map__from_entries(struct cpu_map_entries *cpus) if (map) { unsigned i; - for (i = 0; i < cpus->nr; i++) - map->map[i] = (int)cpus->cpu[i]; + for (i = 0; i < cpus->nr; i++) { + /* + * Special treatment for -1, which is not real cpu number, + * and we need to use (int) -1 to initialize map[i], + * otherwise it would become 65535. + */ + if (cpus->cpu[i] == (u16) -1) + map->map[i] = -1; + else + map->map[i] = (int) cpus->cpu[i]; + } } return map; -- cgit v1.1 From 36e33c53f4b693d96fb8dd4529fe14306d4e3e76 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Jan 2016 11:49:56 +0100 Subject: perf script: Display stat events by default If no script is specified for stat data, display stat events in raw form. 
$ perf stat record ls SNIP Performance counter stats for 'ls': 0.851585 task-clock (msec) # 0.717 CPUs utilized 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.000 K/sec 114 page-faults # 0.134 M/sec 2,620,918 cycles # 3.078 GHz stalled-cycles-frontend stalled-cycles-backend 2,714,111 instructions # 1.04 insns per cycle 542,434 branches # 636.970 M/sec 15,946 branch-misses # 2.94% of all branches 0.001186954 seconds time elapsed $ perf script CPU THREAD VAL ENA RUN TIME EVENT -1 26185 851585 851585 851585 1186954 task-clock -1 26185 0 851585 851585 1186954 context-switches -1 26185 0 851585 851585 1186954 cpu-migrations -1 26185 114 851585 851585 1186954 page-faults -1 26185 2620918 853340 853340 1186954 cycles -1 26185 0 0 0 1186954 stalled-cycles-frontend -1 26185 0 0 0 1186954 stalled-cycles-backend -1 26185 2714111 853340 853340 1186954 instructions -1 26185 542434 853340 853340 1186954 branches -1 26185 15946 853340 853340 1186954 branch-misses Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452077397-31958-3-git-send-email-jolsa@kernel.org [ Rename 'time' parameter to 'tstamp' to fix build on older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5e18654..5e2f9d2 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -677,10 +677,46 @@ static void process_event(struct perf_script *script __maybe_unused, union perf_ static struct scripting_ops *scripting_ops; +static void __process_stat(struct perf_evsel *counter, u64 tstamp) +{ + int nthreads = thread_map__nr(counter->threads); + int ncpus = perf_evsel__nr_cpus(counter); + int cpu, thread; + static int header_printed; + + if (counter->system_wide) + nthreads = 1; + + if (!header_printed) { + printf("%3s %8s %15s %15s %15s %15s %s\n", + "CPU", "THREAD", "VAL", "ENA", "RUN", "TIME", "EVENT"); + header_printed = 1; + } + + for (thread = 0; thread < nthreads; thread++) { + for (cpu = 0; cpu < ncpus; cpu++) { + struct perf_counts_values *counts; + + counts = perf_counts(counter->counts, cpu, thread); + + printf("%3d %8d %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %15" PRIu64 " %s\n", + counter->cpus->map[cpu], + thread_map__pid(counter->threads, thread), + counts->val, + counts->ena, + counts->run, + tstamp, + perf_evsel__name(counter)); + } + } +} + static void process_stat(struct perf_evsel *counter, u64 tstamp) { if (scripting_ops && scripting_ops->process_stat) scripting_ops->process_stat(&stat_config, counter, tstamp); + else + __process_stat(counter, tstamp); } static void process_stat_interval(u64 tstamp) -- cgit v1.1 From b8a1962d17b4e3cfdd7b7dc9ebd94affbcb4c1c5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Jan 2016 11:49:57 +0100 Subject: perf script: Add stat-cpi.py script Adding stat-cpi.py as an example of how to do stat scripting. It computes the CPI metric from the cycles and instructions events. CPI is a basic performance metric showing the Cycles Per Instruction ratio, which helps to identify cycles-hungry code.
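One detail worth noting before the examples: the python engine derives the callback name by prefixing the counter name with 'stat__' and replacing every ':' with '_' (see get_handler_name() in the python support patch above), which is why the script defines stat__cycles_k, stat__cycles_u and friends. A rough sketch of that mangling, illustration only:

    # Rough sketch of the engine's handler-name mangling.
    def handler_name(event_name):
        return "stat__" + event_name.replace(":", "_")

    # handler_name("cycles")          -> "stat__cycles"
    # handler_name("instructions:u")  -> "stat__instructions_u"
    # handler_name("cycles:k")        -> "stat__cycles_k"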
Following stat record/report/script combinations could be used: - get CPI for given workload $ perf stat -e cycles,instructions record ls SNIP Performance counter stats for 'ls': 2,904,431 cycles 3,346,878 instructions # 1.15 insns per cycle 0.001782686 seconds time elapsed $ perf script -s ./scripts/python/stat-cpi.py 0.001783: cpu -1, thread -1 -> cpi 0.867803 (2904431/3346878) $ perf stat -e cycles,instructions record ls | perf script -s ./scripts/python/stat-cpi.py SNIP 0.001730: cpu -1, thread -1 -> cpi 0.869026 (2928292/3369627) - get CPI systemwide: $ perf stat -e cycles,instructions -a -I 1000 record sleep 3 # time counts unit events 1.000158618 594,274,711 cycles (100.00%) 1.000158618 441,898,250 instructions 2.000350973 567,649,705 cycles (100.00%) 2.000350973 432,669,206 instructions 3.000559210 561,940,430 cycles (100.00%) 3.000559210 420,403,465 instructions 3.000670798 780,105 cycles (100.00%) 3.000670798 326,516 instructions $ perf script -s ./scripts/python/stat-cpi.py 1.000159: cpu -1, thread -1 -> cpi 1.344823 (594274711/441898250) 2.000351: cpu -1, thread -1 -> cpi 1.311972 (567649705/432669206) 3.000559: cpu -1, thread -1 -> cpi 1.336669 (561940430/420403465) 3.000671: cpu -1, thread -1 -> cpi 2.389178 (780105/326516) $ perf stat -e cycles,instructions -a -I 1000 record sleep 3 | perf script -s ./scripts/python/stat-cpi.py 1.000202: cpu -1, thread -1 -> cpi 1.035091 (940778881/908885530) 2.000392: cpu -1, thread -1 -> cpi 1.442600 (627493992/434974455) 3.000545: cpu -1, thread -1 -> cpi 1.353612 (741463930/547766890) 3.000622: cpu -1, thread -1 -> cpi 2.642110 (784083/296764) Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: Kan Liang Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452077397-31958-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/scripts/python/stat-cpi.py | 77 +++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tools/perf/scripts/python/stat-cpi.py diff --git a/tools/perf/scripts/python/stat-cpi.py b/tools/perf/scripts/python/stat-cpi.py new file mode 100644 index 0000000..8b60f34 --- /dev/null +++ b/tools/perf/scripts/python/stat-cpi.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +data = {} +times = [] +threads = [] +cpus = [] + +def get_key(time, event, cpu, thread): + return "%d-%s-%d-%d" % (time, event, cpu, thread) + +def store_key(time, cpu, thread): + if (time not in times): + times.append(time) + + if (cpu not in cpus): + cpus.append(cpu) + + if (thread not in threads): + threads.append(thread) + +def store(time, event, cpu, thread, val, ena, run): + #print "event %s cpu %d, thread %d, time %d, val %d, ena %d, run %d" % \ + # (event, cpu, thread, time, val, ena, run) + + store_key(time, cpu, thread) + key = get_key(time, event, cpu, thread) + data[key] = [ val, ena, run] + +def get(time, event, cpu, thread): + key = get_key(time, event, cpu, thread) + return data[key][0] + +def stat__cycles_k(cpu, thread, time, val, ena, run): + store(time, "cycles", cpu, thread, val, ena, run); + +def stat__instructions_k(cpu, thread, time, val, ena, run): + store(time, "instructions", cpu, thread, val, ena, run); + +def stat__cycles_u(cpu, thread, time, val, ena, run): + store(time, "cycles", cpu, thread, val, ena, run); + +def stat__instructions_u(cpu, thread, time, val, ena, run): + store(time, "instructions", cpu, thread, val, ena, run); + +def stat__cycles(cpu, thread, time, val, ena, run): + store(time, "cycles", cpu, thread, 
val, ena, run); + +def stat__instructions(cpu, thread, time, val, ena, run): + store(time, "instructions", cpu, thread, val, ena, run); + +def stat__interval(time): + for cpu in cpus: + for thread in threads: + cyc = get(time, "cycles", cpu, thread) + ins = get(time, "instructions", cpu, thread) + cpi = 0 + + if ins != 0: + cpi = cyc/float(ins) + + print "%15f: cpu %d, thread %d -> cpi %f (%d/%d)" % (time/(float(1000000000)), cpu, thread, cpi, cyc, ins) + +def trace_end(): + pass +# XXX trace_end callback could be used as an alternative place +# to compute same values as in the script above: +# +# for time in times: +# for cpu in cpus: +# for thread in threads: +# cyc = get(time, "cycles", cpu, thread) +# ins = get(time, "instructions", cpu, thread) +# +# if ins != 0: +# cpi = cyc/float(ins) +# +# print "time %.9f, cpu %d, thread %d -> cpi %f" % (time/(float(1000000000)), cpu, thread, cpi) -- cgit v1.1 From 84530920de3c6ccb92c6661da784f6cdb66d3304 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 6 Jan 2016 19:50:01 +0100 Subject: perf pmu: fix alias->snapshot missing initialization bug This patch fixes a bug in __perf_pmu__new_alias() whereby the alias->snapshot field was not initialized to false. This led to random alias->snapshot value for an alias and was breaking some measurements such as: $ perf stat -a -e uncore_imc/data_reads/ -I 1000 sleep 100 Because the event ended up being treated as snapshot mode, when it is not. Signed-off-by: Stephane Eranian Cc: Andi Kleen Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452106201-13073-1-git-send-email-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index e4b173d..b597bcc 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -220,6 +220,7 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name, alias->scale = 1.0; alias->unit[0] = '\0'; alias->per_pkg = false; + alias->snapshot = false; ret = parse_events_terms(&alias->terms, val); if (ret) { -- cgit v1.1 From 4f4ba0e6afac4b181471f5f12bd87294bd8807f6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 09:54:56 -0300 Subject: perf tests: No need to set attr.sample_freq in the perf time to TSC test We were asking for a 4kHz sample_freq, making the test fail needlessly when the system reduced /proc/sys/kernel/perf_event_max_sample_rate below that. In this test we only look at the PERF_SAMPLE_TIME fields in PERF_RECORD_ meta events, no need to set sample_freq. Thanks to Namhyung for suggesting that max_sample_rate could be the reason for the test failure, seeing the 'perf test -vv' output I sent. Before: # echo 1000 > /proc/sys/kernel/perf_event_max_sample_rate # perf test TSC 45: Test converting perf time to TSC : FAILED! 
After: # perf test TSC 45: Test converting perf time to TSC : Ok # cat /proc/sys/kernel/perf_event_max_sample_rate 1000 Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-lcob05qhawkuvsyuu9g1fld5@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/perf-time-to-tsc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index a289aa8..9d29ee2 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -41,7 +41,6 @@ int test__perf_time_to_tsc(int subtest __maybe_unused) .mmap_pages = UINT_MAX, .user_freq = UINT_MAX, .user_interval = ULLONG_MAX, - .freq = 4000, .target = { .uses_mmap = true, }, -- cgit v1.1 From 5bae0250237f7a5ec4355f9920701de247b8db91 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 13:14:56 -0300 Subject: perf evlist: Introduce perf_evlist__new_dummy constructor For cases where all we need is an evlist with just a "dummy" evsel, as in some 'perf test' entries. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-q52le0pblm2k3ncvyilelr9z@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 28 ++++++++++++++++++++++++++++ tools/perf/util/evlist.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index b9eac0d..fa6dbf0 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -68,6 +68,18 @@ struct perf_evlist *perf_evlist__new_default(void) return evlist; } +struct perf_evlist *perf_evlist__new_dummy(void) +{ + struct perf_evlist *evlist = perf_evlist__new(); + + if (evlist && perf_evlist__add_dummy(evlist)) { + perf_evlist__delete(evlist); + evlist = NULL; + } + + return evlist; +} + /** * perf_evlist__set_id_pos - set the positions of event ids. 
* @evlist: selected event list @@ -248,6 +260,22 @@ error: return -ENOMEM; } +int perf_evlist__add_dummy(struct perf_evlist *evlist) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_DUMMY, + .size = sizeof(attr), /* to capture ABI version */ + }; + struct perf_evsel *evsel = perf_evsel__new(&attr); + + if (evsel == NULL) + return -ENOMEM; + + perf_evlist__add(evlist, evsel); + return 0; +} + static int perf_evlist__add_attrs(struct perf_evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs) { diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 139a500..3b7e1e2 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -67,6 +67,7 @@ struct perf_evsel_str_handler { struct perf_evlist *perf_evlist__new(void); struct perf_evlist *perf_evlist__new_default(void); +struct perf_evlist *perf_evlist__new_dummy(void); void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads); void perf_evlist__exit(struct perf_evlist *evlist); @@ -81,6 +82,8 @@ int __perf_evlist__add_default_attrs(struct perf_evlist *evlist, #define perf_evlist__add_default_attrs(evlist, array) \ __perf_evlist__add_default_attrs(evlist, array, ARRAY_SIZE(array)) +int perf_evlist__add_dummy(struct perf_evlist *evlist); + int perf_evlist__add_newtp(struct perf_evlist *evlist, const char *sys, const char *name, void *handler); -- cgit v1.1 From 69ef8f475586974d3921c4799bfa75b8fef877a8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 13:17:00 -0300 Subject: perf test: Use "dummy" events in the PERF_RECORD_ test As we're testing just the !PERF_RECORD_SAMPLE records. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-qp8radcz3il4q9wbnseh337d@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-record.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 9d5f0b5..8dc0bab 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -45,7 +45,7 @@ int test__PERF_RECORD(int subtest __maybe_unused) }; cpu_set_t cpu_mask; size_t cpu_mask_size = sizeof(cpu_mask); - struct perf_evlist *evlist = perf_evlist__new_default(); + struct perf_evlist *evlist = perf_evlist__new_dummy(); struct perf_evsel *evsel; struct perf_sample sample; const char *cmd = "sleep"; @@ -61,6 +61,9 @@ int test__PERF_RECORD(int subtest __maybe_unused) int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, }; char sbuf[STRERR_BUFSIZE]; + if (evlist == NULL) /* Fallback for kernels lacking PERF_COUNT_SW_DUMMY */ + evlist = perf_evlist__new_default(); + if (evlist == NULL || argv == NULL) { pr_debug("Not enough memory to create evlist\n"); goto out; -- cgit v1.1 From 2e4f81ee90e801017115afb7397198f65e512031 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 13:20:22 -0300 Subject: perf test: No need for setting attr.sample_freq on the RECORD test We're not looking at PERF_RECORD_SAMPLE entries and now by default we use PERF_COUNT_SW_DUMMY, so just remove that setting. 
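For reference, the attribute that perf_evlist__add_dummy() sets up above maps to a plain perf_event_open() call; a minimal standalone sketch, not part of these patches (assumes kernel headers that provide PERF_COUNT_SW_DUMMY, i.e. v3.12 or later):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_DUMMY;
		attr.size = sizeof(attr); /* to capture the ABI version */
		/* no sample_freq needed: only the side-band records are of interest */

		fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}
		close(fd);
		return 0;
	}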
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-cly7cnotktv5rqao13pkorem@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/perf-record.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 8dc0bab..1cc78ce 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -40,7 +40,6 @@ int test__PERF_RECORD(int subtest __maybe_unused) .uses_mmap = true, }, .no_buffering = true, - .freq = 10, .mmap_pages = 256, }; cpu_set_t cpu_mask; -- cgit v1.1 From 372b212263998ae8e3b2f80ce52af3168d749158 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 16:47:11 -0300 Subject: perf python: Add missing files to binding link list Fixing this problem, introduced recently: $ perf test python 16: Try 'import perf' in python, checking link problems : FAILED! In verbose mode we find out what is missing: $ perf test -v python 16: Try 'import perf' in python, checking link problems : --- start --- test child forked, pid 24894 Traceback (most recent call last): File "", line 1, in ImportError: /tmp/build/perf/python/perf.so: undefined symbol: find_next_bit test child finished with -1 ---- end ---- Try 'import perf' in python, checking link problems: FAILED! $ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: f77b57ad4fc4 ("perf cpu_map: Add cpu_map__new_event function") Link: http://lkml.kernel.org/n/tip-rajx0zkz6czdrnvvwf0jp76p@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python-ext-sources | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 51be28b..38a0b63 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -10,6 +10,8 @@ util/ctype.c util/evlist.c util/evsel.c util/cpumap.c +util/bitmap.c +../lib/util/find_next_bit.c ../lib/hweight.c util/thread_map.c util/util.c -- cgit v1.1 From 239849dde350c9f7356974ca4f62171f50a2c86e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 16:51:58 -0300 Subject: perf tests: No need to set attr.sample_freq for tracking !PERF_RECORD_SAMPLE We were asking for a 4kHz sample_freq, making the test fail needlessly when the system reduced /proc/sys/kernel/perf_event_max_sample_rate below that. 
Before: # perf test -vv dummy 23: Test using a dummy software event to keep tracking : --- start --- test child forked, pid 32421 ------------------------------------------------------------ perf_event_attr: type 1 size 112 config 0x9 { sample_period, sample_freq } 4000 sample_type IP|TID|ID|PERIOD sys_perf_event_open failed, error -22 Unable to open dummy and cycles event test child finished with -2 ---- end ---- Test using a dummy software event to keep tracking: Skip # [root@zoo ~]# cat /proc/sys/kernel/perf_event_max_sample_rate 1000 After: [root@zoo ~]# perf test dummy 23: Test using a dummy software event to keep tracking : Ok Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-487iquegrs2379e5n0pi0tcp@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/keep-tracking.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index a337a6d..6158132 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -55,7 +55,6 @@ int test__keep_tracking(int subtest __maybe_unused) .mmap_pages = UINT_MAX, .user_freq = UINT_MAX, .user_interval = ULLONG_MAX, - .freq = 4000, .target = { .uses_mmap = true, }, -- cgit v1.1 From a831e67913e356be63d5bb0509fc1af3c4e6ceb4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jan 2016 16:59:27 -0300 Subject: perf tests: Give a bit more information on the CQM test failure path Before: $ perf test -v cqm 48: Test intel cqm nmi context read : --- start --- test child forked, pid 1681 parse_events failed test child finished with -2 ---- end ---- Test intel cqm nmi context read: Skip $ After: $ perf test -v cqm 48: Test intel cqm nmi context read : --- start --- test child forked, pid 1681 parse_events failed, is "intel_cqm/llc_occupancy/" available? test child finished with -2 ---- end ---- Test intel cqm nmi context read: Skip $ Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Matt Fleming Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-eidpiv5x4nkbsx37xwikbnir@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/tests/intel-cqm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c index 94e0cb7..3e89ba8 100644 --- a/tools/perf/arch/x86/tests/intel-cqm.c +++ b/tools/perf/arch/x86/tests/intel-cqm.c @@ -54,7 +54,7 @@ int test__intel_cqm_count_nmi_context(int subtest __maybe_unused) ret = parse_events(evlist, "intel_cqm/llc_occupancy/", NULL); if (ret) { - pr_debug("parse_events failed\n"); + pr_debug("parse_events failed, is \"intel_cqm/llc_occupancy/\" available?\n"); err = TEST_SKIP; goto out; } -- cgit v1.1 From 552eb975b83756e3d95aeb5edaeb5c3c032b0021 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Jan 2016 10:46:52 -0300 Subject: tools lib: Move find_next_bit.c to tools/lib/ The commit that introduced it should've moved it to the same place, plus the 'tools/' prefix, but instead moved it to a bogus tools/lib/util/ directory, being the only file there. Move it to tools/lib/find_bit.c, picking the name used for the file where these routines have lived since: 8f6f19dd5143 ("lib: move find_last_bit to lib/find_next_bit.c") The next step is to make tools/lib/find_bit.c differ from lib/find_bit.c only in removing what is not used by tools/. 
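For reference, the calling convention is unchanged by the move; a minimal usage sketch (an illustration only, assuming it is compiled together with tools/lib/find_bit.c, hence the local prototypes):

	#include <stdio.h>

	/* provided by tools/lib/find_bit.c */
	unsigned long find_first_bit(const unsigned long *addr, unsigned long size);
	unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
				    unsigned long offset);

	int main(void)
	{
		unsigned long bits[1] = { 0x89 }; /* bits 0, 3 and 7 set */
		unsigned long nbits = sizeof(bits) * 8;
		unsigned long i;

		/* the canonical iteration pattern behind for_each_set_bit() */
		for (i = find_first_bit(bits, nbits);
		     i < nbits;
		     i = find_next_bit(bits, nbits, i + 1))
			printf("bit %lu is set\n", i);

		return 0;
	}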
Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: George Spelvin Cc: Namhyung Kim Cc: Rasmus Villemoes Cc: Wang Nan Cc: Yury Norov Link: http://lkml.kernel.org/n/tip-p391cex5mqvahp4pwrton87n@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/find_bit.c | 89 ++++++++++++++++++++++++++++++++++++++ tools/lib/util/find_next_bit.c | 89 -------------------------------------- tools/perf/MANIFEST | 2 +- tools/perf/util/Build | 6 +-- tools/perf/util/python-ext-sources | 2 +- 5 files changed, 94 insertions(+), 94 deletions(-) create mode 100644 tools/lib/find_bit.c delete mode 100644 tools/lib/util/find_next_bit.c diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c new file mode 100644 index 0000000..732d8c3 --- /dev/null +++ b/tools/lib/find_bit.c @@ -0,0 +1,89 @@ +/* find_next_bit.c: fallback find next bit implementation + * + * Copied from lib/find_next_bit.c to tools/lib/find_bit.c + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +#ifndef find_next_bit +/* + * Find the next set bit in a memory region. + */ +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} +#endif + +#ifndef find_first_bit +/* + * Find the first set bit in a memory region. + */ +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + const unsigned long *p = addr; + unsigned long result = 0; + unsigned long tmp; + + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + + tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found: + return result + __ffs(tmp); +} +#endif diff --git a/tools/lib/util/find_next_bit.c b/tools/lib/util/find_next_bit.c deleted file mode 100644 index 41b44f6..0000000 --- a/tools/lib/util/find_next_bit.c +++ /dev/null @@ -1,89 +0,0 @@ -/* find_next_bit.c: fallback find next bit implementation - * - * Copied from lib/find_next_bit.c to tools/lib/next_bit.c - * - * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - -#ifndef find_next_bit -/* - * Find the next set bit in a memory region. - */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG-1); - unsigned long tmp; - - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); -} -#endif - -#ifndef find_first_bit -/* - * Find the first set bit in a memory region. - */ -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) -{ - const unsigned long *p = addr; - unsigned long result = 0; - unsigned long tmp; - - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. 
*/ -found: - return result + __ffs(tmp); -} -#endif diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index ce3932e..b3db8df 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -26,7 +26,7 @@ tools/lib/rbtree.c tools/lib/string.c tools/lib/symbol/kallsyms.c tools/lib/symbol/kallsyms.h -tools/lib/util/find_next_bit.c +tools/lib/find_bit.c tools/include/asm/atomic.h tools/include/asm/barrier.h tools/include/asm/bug.h diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 94b1099..e8bc10b 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -9,7 +9,7 @@ libperf-y += env.o libperf-y += event.o libperf-y += evlist.o libperf-y += evsel.o -libperf-y += find_next_bit.o +libperf-y += find_bit.o libperf-y += kallsyms.o libperf-y += levenshtein.o libperf-y += llvm-utils.o @@ -132,7 +132,7 @@ CFLAGS_pmu-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c -CFLAGS_find_next_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" +CFLAGS_find_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_rbtree.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_hweight.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" @@ -142,7 +142,7 @@ $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) -$(OUTPUT)util/find_next_bit.o: ../lib/util/find_next_bit.c FORCE +$(OUTPUT)util/find_bit.o: ../lib/find_bit.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 38a0b63..f15d14f 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -11,7 +11,7 @@ util/evlist.c util/evsel.c util/cpumap.c util/bitmap.c -../lib/util/find_next_bit.c +../lib/find_bit.c ../lib/hweight.c util/thread_map.c util/util.c -- cgit v1.1 From 64af4e0da419ef9e9db0d34a3b5836adbf90a5e8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Jan 2016 11:26:43 -0300 Subject: tools lib: Sync tools/lib/find_bit.c with the kernel Need to move the bitmap.[ch] things from tools/perf/ to tools/lib, will be done in the next patches. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: George Spelvin Cc: Namhyung Kim Cc: Rasmus Villemoes Cc: Wang Nan Cc: Yury Norov Link: http://lkml.kernel.org/n/tip-5fys65wkd7gu8j7a7xgukc5t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/find_bit.c | 103 ++++++++++++++++----------------- tools/perf/util/include/linux/bitmap.h | 2 + 2 files changed, 51 insertions(+), 54 deletions(-) diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 732d8c3..9122a9e 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -1,10 +1,17 @@ -/* find_next_bit.c: fallback find next bit implementation +/* bit search implementation * - * Copied from lib/find_next_bit.c to tools/lib/find_bit.c + * Copied from lib/find_bit.c to tools/lib/find_bit.c * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) * + * Copyright (C) 2008 IBM Corporation + * 'find_last_bit' is written by Rusty Russell + * (Inspired by David Howell's find_next_bit implementation) + * + * Rewritten by Yury Norov to decrease + * size and improve performance, 2015. + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -12,52 +19,50 @@ */ #include -#include -#include +#include +#include -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) +#if !defined(find_next_bit) -#ifndef find_next_bit /* - * Find the next set bit in a memory region. + * This is a common helper function for find_next_bit and + * find_next_zero_bit. The difference is the "invert" argument, which + * is XORed with each fetched word before searching it for one bits. */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) +static unsigned long _find_next_bit(const unsigned long *addr, + unsigned long nbits, unsigned long start, unsigned long invert) { - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; } - if (!size) - return result; - tmp = *p; -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); + return min(start + __ffs(tmp), nbits); +} +#endif + +#ifndef find_next_bit +/* + * Find the next set bit in a memory region. + */ +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, size, offset, 0UL); } #endif @@ -67,23 +72,13 @@ found_middle: */ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { - const unsigned long *p = addr; - unsigned long result = 0; - unsigned long tmp; + unsigned long idx; - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + if (addr[idx]) + return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size); } - if (!size) - return result; - tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. 
*/ -found: - return result + __ffs(tmp); + return size; } #endif diff --git a/tools/perf/util/include/linux/bitmap.h b/tools/perf/util/include/linux/bitmap.h index 40bd214..28f5493 100644 --- a/tools/perf/util/include/linux/bitmap.h +++ b/tools/perf/util/include/linux/bitmap.h @@ -11,6 +11,8 @@ int __bitmap_weight(const unsigned long *bitmap, int bits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) + #define BITMAP_LAST_WORD_MASK(nbits) \ ( \ ((nbits) % BITS_PER_LONG) ? \ -- cgit v1.1 From 915b0882c3108a21e9b3b5e176d3151ad522242d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 8 Jan 2016 12:33:37 -0300 Subject: tools lib: Move bitmap.[ch] from tools/perf/ to tools/{lib,include}/ So that lib/find_bit.c doesn't requires anything inside tools/perf/ Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: George Spelvin Cc: Namhyung Kim Cc: Rasmus Villemoes Cc: Wang Nan Cc: Yury Norov Link: http://lkml.kernel.org/n/tip-7lxe7jgohaac5faodndhdmvk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/bitmap.h | 68 ++++++++++++++++++++++++++++++++++ tools/lib/bitmap.c | 31 ++++++++++++++++ tools/perf/util/Build | 5 +++ tools/perf/util/bitmap.c | 31 ---------------- tools/perf/util/include/linux/bitmap.h | 68 ---------------------------------- tools/perf/util/python-ext-sources | 2 +- 6 files changed, 105 insertions(+), 100 deletions(-) create mode 100644 tools/include/linux/bitmap.h create mode 100644 tools/lib/bitmap.c delete mode 100644 tools/perf/util/bitmap.c delete mode 100644 tools/perf/util/include/linux/bitmap.h diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h new file mode 100644 index 0000000..28f5493 --- /dev/null +++ b/tools/include/linux/bitmap.h @@ -0,0 +1,68 @@ +#ifndef _PERF_BITOPS_H +#define _PERF_BITOPS_H + +#include +#include + +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +int __bitmap_weight(const unsigned long *bitmap, int bits); +void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); + +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) + +#define BITMAP_LAST_WORD_MASK(nbits) \ +( \ + ((nbits) % BITS_PER_LONG) ? 
\ + (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ +) + +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) + +static inline void bitmap_zero(unsigned long *dst, int nbits) +{ + if (small_const_nbits(nbits)) + *dst = 0UL; + else { + int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + memset(dst, 0, len); + } +} + +static inline int bitmap_weight(const unsigned long *src, int nbits) +{ + if (small_const_nbits(nbits)) + return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); + return __bitmap_weight(src, nbits); +} + +static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (small_const_nbits(nbits)) + *dst = *src1 | *src2; + else + __bitmap_or(dst, src1, src2, nbits); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + */ +static inline int test_and_set_bit(int nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old; + + old = *p; + *p = old | mask; + + return (old & mask) != 0; +} + +#endif /* _PERF_BITOPS_H */ diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c new file mode 100644 index 0000000..0a1adc1 --- /dev/null +++ b/tools/lib/bitmap.c @@ -0,0 +1,31 @@ +/* + * From lib/bitmap.c + * Helper functions for bitmap.h. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ +#include + +int __bitmap_weight(const unsigned long *bitmap, int bits) +{ + int k, w = 0, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; k++) + w += hweight_long(bitmap[k]); + + if (bits % BITS_PER_LONG) + w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits)); + + return w; +} + +void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k; + int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] | bitmap2[k]; +} diff --git a/tools/perf/util/Build b/tools/perf/util/Build index e8bc10b..5eec53a 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -132,6 +132,7 @@ CFLAGS_pmu-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c +CFLAGS_bitmap.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_find_bit.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_rbtree.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" @@ -142,6 +143,10 @@ $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) +$(OUTPUT)util/bitmap.o: ../lib/bitmap.c FORCE + $(call rule_mkdir) + $(call if_changed_dep,cc_o_c) + $(OUTPUT)util/find_bit.o: ../lib/find_bit.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_o_c) diff --git a/tools/perf/util/bitmap.c b/tools/perf/util/bitmap.c deleted file mode 100644 index 0a1adc1..0000000 --- a/tools/perf/util/bitmap.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * From lib/bitmap.c - * Helper functions for bitmap.h. - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. 
- */ -#include - -int __bitmap_weight(const unsigned long *bitmap, int bits) -{ - int k, w = 0, lim = bits/BITS_PER_LONG; - - for (k = 0; k < lim; k++) - w += hweight_long(bitmap[k]); - - if (bits % BITS_PER_LONG) - w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits)); - - return w; -} - -void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) -{ - int k; - int nr = BITS_TO_LONGS(bits); - - for (k = 0; k < nr; k++) - dst[k] = bitmap1[k] | bitmap2[k]; -} diff --git a/tools/perf/util/include/linux/bitmap.h b/tools/perf/util/include/linux/bitmap.h deleted file mode 100644 index 28f5493..0000000 --- a/tools/perf/util/include/linux/bitmap.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _PERF_BITOPS_H -#define _PERF_BITOPS_H - -#include -#include - -#define DECLARE_BITMAP(name,bits) \ - unsigned long name[BITS_TO_LONGS(bits)] - -int __bitmap_weight(const unsigned long *bitmap, int bits); -void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); - -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) - -#define BITMAP_LAST_WORD_MASK(nbits) \ -( \ - ((nbits) % BITS_PER_LONG) ? \ - (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ -) - -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) - -static inline void bitmap_zero(unsigned long *dst, int nbits) -{ - if (small_const_nbits(nbits)) - *dst = 0UL; - else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); - memset(dst, 0, len); - } -} - -static inline int bitmap_weight(const unsigned long *src, int nbits) -{ - if (small_const_nbits(nbits)) - return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); - return __bitmap_weight(src, nbits); -} - -static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) -{ - if (small_const_nbits(nbits)) - *dst = *src1 | *src2; - else - __bitmap_or(dst, src1, src2, nbits); -} - -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - */ -static inline int test_and_set_bit(int nr, unsigned long *addr) -{ - unsigned long mask = BIT_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); - unsigned long old; - - old = *p; - *p = old | mask; - - return (old & mask) != 0; -} - -#endif /* _PERF_BITOPS_H */ diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index f15d14f..8162ba0 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -10,7 +10,7 @@ util/ctype.c util/evlist.c util/evsel.c util/cpumap.c -util/bitmap.c +../lib/bitmap.c ../lib/find_bit.c ../lib/hweight.c util/thread_map.c -- cgit v1.1 From 42b276a2351517409d55b1202a1fa8b05c0cdc99 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 5 Jan 2016 12:06:00 +0900 Subject: perf top: Decay periods in callchains The code missed decaying the periods in callchains when decaying the hist entries. This resulted in more than 100 percent overhead in callchains in the fractal style output. 
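The 7/8 factor in the fix below matches the one he_stat__decay() already applies to the entry period itself, which is what brings the percentages back into bounds; a worked illustration (numbers invented):

	entry period:  800 -> 800 * 7/8 = 700
	callchain hit: 800 -> 800 * 7/8 = 700  (with this fix; it used to stay at 800)
	fraction:      700/700 = 100%          (instead of 800/700 ~= 114%)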
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1451963160-17196-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/callchain.c | 28 ++++++++++++++++++++++++++++ tools/perf/util/callchain.h | 1 + tools/perf/util/hist.c | 1 + 3 files changed, 30 insertions(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 564377d..53c43eb 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -925,6 +925,34 @@ void free_callchain(struct callchain_root *root) free_callchain_node(&root->node); } +static u64 decay_callchain_node(struct callchain_node *node) +{ + struct callchain_node *child; + struct rb_node *n; + u64 child_hits = 0; + + n = rb_first(&node->rb_root_in); + while (n) { + child = container_of(n, struct callchain_node, rb_node_in); + + child_hits += decay_callchain_node(child); + n = rb_next(n); + } + + node->hit = (node->hit * 7) / 8; + node->children_hit = child_hits; + + return node->hit; +} + +void decay_callchain(struct callchain_root *root) +{ + if (!symbol_conf.use_callchain) + return; + + decay_callchain_node(&root->node); +} + int callchain_node__make_parent_list(struct callchain_node *node) { struct callchain_node *parent = node->parent; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 8ac8f043..18dd222 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -253,6 +253,7 @@ int callchain_node__fprintf_value(struct callchain_node *node, FILE *fp, u64 total); void free_callchain(struct callchain_root *root); +void decay_callchain(struct callchain_root *root); int callchain_node__make_parent_list(struct callchain_node *node); #endif /* __PERF_CALLCHAIN_H */ diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index afc9b8f..888776b 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -254,6 +254,7 @@ static bool hists__decay_entry(struct hists *hists, struct hist_entry *he) he_stat__decay(&he->stat); if (symbol_conf.cumulate_callchain) he_stat__decay(he->stat_acc); + decay_callchain(he->callchain); diff = prev_period - he->stat.period; -- cgit v1.1 From 1e9abf8b03c8f9352f54171647296c41317679a4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 29 Nov 2015 23:24:17 +0900 Subject: perf report: Change default to use event group view The event group view feature is to see related events together. To use the group view, events should be recorded as a group with a dedicated syntax that surrounds the events with braces (-e '{ evt1, evt2, ... }'). 'perf report' also requires the --group option to enable it. However it's almost always beneficial to use the group view to see the group events as it's more expressive. And I think it's more natural to see events together if they are recorded as a group. Thus this patch changes the default value to enable it. Users who want to keep the original behavior can set the report.group config variable to false and/or use the --no-group option in the 'perf report' command line. 
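For example, the old behavior can be made the default again with something like this in ~/.perfconfig (a sketch using the report.group variable mentioned above):

	[report]
		group = false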
Requested-by: Ingo Molnar Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Taeung Song Link: http://lkml.kernel.org/r/1448807057-3506-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index d51abd2..3b2de6e 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -39,6 +39,7 @@ struct symbol_conf symbol_conf = { .cumulate_callchain = true, .show_hist_headers = true, .symfs = "", + .event_group = true, }; static enum dso_binary_type binary_type_symtab[] = { -- cgit v1.1 From cbd08b7335c9d559f424dcef7bea333605597490 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:04 +0100 Subject: perf tools: Do not show trace command if it's not compiled in The trace command still appears in the help message when you run a plain 'perf' command. That is because generate-cmdlist.sh does not care about the HAVE_LIBAUDIT_SUPPORT dependency of the trace command and puts it into the generated common_cmds array. Wrap the trace command in an HAVE_LIBAUDIT_SUPPORT guard, which excludes it from the common_cmds array when HAVE_LIBAUDIT_SUPPORT is not set. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-8-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/command-list.txt | 2 +- tools/perf/util/generate-cmdlist.sh | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index acc3ea7..ab5cbaa 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -26,4 +26,4 @@ perf-stat mainporcelain common perf-test mainporcelain common perf-timechart mainporcelain common perf-top mainporcelain common -perf-trace mainporcelain common +perf-trace mainporcelain audit diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh index 36a885d..0ac2037 100755 --- a/tools/perf/util/generate-cmdlist.sh +++ b/tools/perf/util/generate-cmdlist.sh @@ -36,4 +36,19 @@ do }' "Documentation/perf-$cmd.txt" done echo "#endif /* HAVE_LIBELF_SUPPORT */" + +echo "#ifdef HAVE_LIBAUDIT_SUPPORT" +sed -n -e 's/^perf-\([^ ]*\)[ ].* audit*/\1/p' command-list.txt | +sort | +while read cmd +do + sed -n ' + /^NAME/,/perf-'"$cmd"'/H + ${ + x + s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ + p + }' "Documentation/perf-$cmd.txt" +done +echo "#endif /* HAVE_LIBAUDIT_SUPPORT */" echo "};" -- cgit v1.1 From 2d7c03e6b0c604decae33b0ce03e69b79b2a39a1 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 8 Jan 2016 14:23:57 +0000 Subject: perf tools: Add missing headers in perf's MANIFEST These missing headers were found in arm64 cross builds, which failed to build perf from tarballs generated with: $ make perf-targz-src-pkg Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1452263041-225488-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/MANIFEST | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index b3db8df..ddf922f 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -1,6 +1,7 @@ tools/perf tools/arch/alpha/include/asm/barrier.h tools/arch/arm/include/asm/barrier.h 
+tools/arch/arm64/include/asm/barrier.h tools/arch/ia64/include/asm/barrier.h tools/arch/mips/include/asm/barrier.h tools/arch/powerpc/include/asm/barrier.h @@ -30,6 +31,7 @@ tools/lib/find_bit.c tools/include/asm/atomic.h tools/include/asm/barrier.h tools/include/asm/bug.h +tools/include/asm-generic/atomic-gcc.h tools/include/asm-generic/barrier.h tools/include/asm-generic/bitops/arch_hweight.h tools/include/asm-generic/bitops/atomic.h -- cgit v1.1 From 9cdbc409626b29ab30f06a6393db6763f040f753 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:05 +0100 Subject: perf script: Align event name properly Adding code to align event names, so we get aligned output in case of multiple events with different names. Before: $ perf script :13757 13757 163918.230829: cpu/mem-snp-none/P: ffff88085f20d010 :13757 13757 163918.230832: cpu/mem-loads,ldlat=30/P: 7f5a5f719f00 :13757 13757 163918.230835: cpu/mem-loads,ldlat=30/P: 7f5a5f719f00 :13758 13758 163918.230838: cpu/mem-snp-none/P: ffff88085f4ad810 :13758 13758 163918.154093: cpu/mem-stores/P: ffff88085bb53f28 :13757 13757 163918.155264: cpu/mem-snp-hitm/P: 601080 ... After: $ perf script :13757 13757 163918.228831: cpu/mem-snp-none/P: ffffffff81a841c0 :13757 13757 163918.228834: cpu/mem-loads,ldlat=30/P: 7f5a5f719f08 :13757 13757 163918.228837: cpu/mem-loads,ldlat=30/P: 7f5a5f719f08 :13758 13758 163918.228837: cpu/mem-snp-none/P: ffff88085f4ad800 :13758 13758 163918.154093: cpu/mem-stores/P: ffff88085bb53f28 :13757 13757 163918.155264: cpu/mem-snp-hitm/P: 601080 ... Signed-off-by: Jiri Olsa Acked-by: David Ahern Cc: Adrian Hunter Cc: Namhyung Kim Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5e2f9d2..c691214 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -617,9 +617,24 @@ struct perf_script { bool allocated; struct cpu_map *cpus; struct thread_map *threads; + int name_width; }; -static void process_event(struct perf_script *script __maybe_unused, union perf_event *event, +static int perf_evlist__max_name_len(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + int max = 0; + + evlist__for_each(evlist, evsel) { + int len = strlen(perf_evsel__name(evsel)); + + max = MAX(len, max); + } + + return max; +} + +static void process_event(struct perf_script *script, union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al) { @@ -636,7 +651,12 @@ static void process_event(struct perf_script *script __maybe_unused, union perf_ if (PRINT_FIELD(EVNAME)) { const char *evname = perf_evsel__name(evsel); - printf("%s: ", evname ? evname : "[unknown]"); + + if (!script->name_width) + script->name_width = perf_evlist__max_name_len(script->session->evlist); + + printf("%*s: ", script->name_width, + evname ? evname : "[unknown]"); } if (print_flags) -- cgit v1.1 From 685c84154cef61dd7d961f36ab92f13c3ef5d354 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:06 +0100 Subject: perf tools: Include all tools/lib directory for tags/cscope/TAGS targets Besides lockdep we use all the 'tools/lib' code in perf, so include it completely in tags. 
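For example, after this change (a sketch; 'tags' is one of the Makefile.perf targets using the TAG_FOLDERS variable touched below):

	$ make -C tools/perf tags	# or TAGS, or cscope
	$ vim -t perf_evlist__open	# tag lookups now cover tools/lib too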
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-10-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.perf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 404e3b1..1025ea7 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -488,7 +488,7 @@ INSTALL_DOC_TARGETS += quick-install-doc quick-install-man quick-install-html $(DOC_TARGETS): $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:doc=all) -TAG_FOLDERS= . ../lib/traceevent ../lib/api ../lib/symbol ../include ../lib/bpf +TAG_FOLDERS= . ../lib ../include TAG_FILES= ../../include/uapi/linux/perf_event.h TAGS: -- cgit v1.1 From bb4ced29f5d5ff1d4d51b602dad34a0d15495a67 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:07 +0100 Subject: perf tools: Remove list entry from struct sort_entry It's no longer needed. Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: David Ahern Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-11-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index dec536b..687bbb12 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -214,8 +214,6 @@ enum sort_type { */ struct sort_entry { - struct list_head list; - const char *se_header; int64_t (*se_cmp)(struct hist_entry *, struct hist_entry *); -- cgit v1.1 From b97511c5bc94ef12613f485ab82f989df04088da Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:08 +0100 Subject: perf tools: Add overhead/overhead_children keys defaults via string We currently set 'overhead' and 'overhead_children' as default sort keys within the perf_hpp__init function by adding them directly into the sort list. This patch adds 'overhead' and 'overhead_children' in text form into sort_keys and lets them be added by the standard sort dimension interface. We need to eliminate direct sort_list additions to be able to add support for hists-specific sort keys. 
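The net effect on the sort keys string, as an illustration (assuming --children is what sets symbol_conf.cumulate_callchain, and a non-strict field order):

	-s comm,dso       -> "overhead,comm,dso"
	                     ("overhead_children,overhead,comm,dso" with --children)
	-s overhead,comm  -> unchanged without --children, "overhead" is already present

Note that prefix_if_not_in() below tests with strstr(), so a substring match such as "overhead" inside "overhead_children" already counts as present.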
*/ - list = &perf_hpp__format[PERF_HPP__OVERHEAD].sort_list; - if (list_empty(list)) - list_add(list, &perf_hpp__sort_list); - - if (symbol_conf.cumulate_callchain) { - list = &perf_hpp__format[PERF_HPP__OVERHEAD_ACC].sort_list; - if (list_empty(list)) - list_add(list, &perf_hpp__sort_list); - } } void perf_hpp__column_register(struct perf_hpp_fmt *format) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 04e2a5c..ec72234 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2252,6 +2252,34 @@ static int setup_sort_order(struct perf_evlist *evlist) return 0; } +/* + * Adds 'pre,' prefix into 'str' is 'pre' is + * not already part of 'str'. + */ +static char *prefix_if_not_in(const char *pre, char *str) +{ + char *n; + + if (!str || strstr(str, pre)) + return str; + + if (asprintf(&n, "%s,%s", pre, str) < 0) + return NULL; + + free(str); + return n; +} + +static char *setup_overhead(char *keys) +{ + keys = prefix_if_not_in("overhead", keys); + + if (symbol_conf.cumulate_callchain) + keys = prefix_if_not_in("overhead_children", keys); + + return keys; +} + static int __setup_sorting(struct perf_evlist *evlist) { char *tmp, *tok, *str; @@ -2281,6 +2309,17 @@ static int __setup_sorting(struct perf_evlist *evlist) return -ENOMEM; } + /* + * Prepend overhead fields for backward compatibility. + */ + if (!is_strict_order(field_order)) { + str = setup_overhead(str); + if (str == NULL) { + error("Not enough memory to setup overhead keys"); + return -ENOMEM; + } + } + for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { ret = sort_dimension__add(tok, evlist); -- cgit v1.1 From 21e6d8428664293f203be3004dcd8d70f68ebdb9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:09 +0100 Subject: perf diff: Use perf_hpp__register_sort_field interface Using perf_hpp__register_sort_field interface instead of directly adding the entry. 
Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Adrian Hunter Cc: David Ahern Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-13-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-diff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 8706383..36ccc2b 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1207,7 +1207,7 @@ static int ui_init(void) BUG_ON(1); } - list_add(&fmt->sort_list, &perf_hpp__sort_list); + perf_hpp__register_sort_field(fmt); return 0; } -- cgit v1.1 From fc284be9d88528dd2a28d5471e40a6acde6c3036 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 7 Jan 2016 10:14:10 +0100 Subject: perf hists: Export a couple of hist functions These are necessary for multi threaded sample processing: - hists__get__get_rotate_entries_in() - hists__collapse_insert_entry() - __hists__init() Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: David Ahern Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-14-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 19 ++++++++++++------- tools/perf/util/hist.h | 5 +++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 888776b..c226303 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1007,9 +1007,8 @@ void hist_entry__delete(struct hist_entry *he) * collapse the histogram */ -static bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, - struct rb_root *root, - struct hist_entry *he) +bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, + struct rb_root *root, struct hist_entry *he) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -1049,7 +1048,7 @@ static bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, return true; } -static struct rb_root *hists__get_rotate_entries_in(struct hists *hists) +struct rb_root *hists__get_rotate_entries_in(struct hists *hists) { struct rb_root *root; @@ -1584,10 +1583,8 @@ int perf_hist_config(const char *var, const char *value) return 0; } -static int hists_evsel__init(struct perf_evsel *evsel) +int __hists__init(struct hists *hists) { - struct hists *hists = evsel__hists(evsel); - memset(hists, 0, sizeof(*hists)); hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT; hists->entries_in = &hists->entries_in_array[0]; @@ -1627,6 +1624,14 @@ static void hists_evsel__exit(struct perf_evsel *evsel) hists__delete_all_entries(hists); } +static int hists_evsel__init(struct perf_evsel *evsel) +{ + struct hists *hists = evsel__hists(evsel); + + __hists__init(hists); + return 0; +} + /* * XXX We probably need a hists_evsel__exit() to free the hist_entries * stored in the rbtree... 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index cb8f373..d4ec482 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -185,6 +185,11 @@ static inline struct hists *evsel__hists(struct perf_evsel *evsel) } int hists__init(void); +int __hists__init(struct hists *hists); + +struct rb_root *hists__get_rotate_entries_in(struct hists *hists); +bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, + struct rb_root *root, struct hist_entry *he); struct perf_hpp { char *buf; -- cgit v1.1 From 14cbfbeb76540cc0c53fbb0ba34b3a4900ebe40f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 7 Jan 2016 20:41:53 +0900 Subject: perf report: Show random usage tip on the help line Currently perf report only shows a help message "For a higher level overview, try: perf report --sort comm,dso" unconditionally (even if the sort keys were used). Add more help tips and show randomly. Load tips from ${prefix}/share/doc/perf-tip/tips.txt file. $ perf report | tail 0.10% swapper [kernel.vmlinux] [k] irq_exit 0.09% swapper [kernel.vmlinux] [k] flush_smp_call_function_queue 0.08% swapper [kernel.vmlinux] [k] native_write_msr_safe 0.03% swapper [kernel.vmlinux] [k] group_sched_in 0.01% perf [kernel.vmlinux] [k] native_write_msr_safe # # (Tip: Search options using a keyword: perf report -h ) # Signed-off-by: Namhyung Kim Acked-by: Ingo Molnar Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1452166913-27046-1-git-send-email-namhyung@kernel.org [ Renamed it to perf_tip() and the parameter dirname to dirpath to fix the build on older distros ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Build | 1 + tools/perf/Documentation/tips.txt | 14 ++++++++++++++ tools/perf/Makefile.perf | 3 +++ tools/perf/builtin-report.c | 2 +- tools/perf/config/Makefile | 6 ++++++ tools/perf/perf.c | 4 ++++ tools/perf/util/util.c | 27 +++++++++++++++++++++++++++ tools/perf/util/util.h | 2 ++ 8 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tools/perf/Documentation/tips.txt diff --git a/tools/perf/Build b/tools/perf/Build index 00c4b8c..6b67e6f 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -41,6 +41,7 @@ CFLAGS_perf.o += -DPERF_HTML_PATH="BUILD_STR($(htmldir_SQ))" \ -DPREFIX="BUILD_STR($(prefix_SQ))" \ -include $(OUTPUT)PERF-VERSION-FILE CFLAGS_builtin-trace.o += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_SQ))" +CFLAGS_builtin-report.o += -DTIPDIR="BUILD_STR($(tipdir_SQ))" libperf-y += util/ libperf-y += arch/ diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt new file mode 100644 index 0000000..a1c10e3 --- /dev/null +++ b/tools/perf/Documentation/tips.txt @@ -0,0 +1,14 @@ +For a higher level overview, try: perf report --sort comm,dso +Sample related events with: perf record -e '{cycles,instructions}:S' +Compare performance results with: perf diff [ ] +Boolean options have negative forms, e.g.: perf report --no-children +Customize output of perf script with: perf script -F event,ip,sym +Generate a script for your data: perf script -g +Save output of perf stat using: perf stat record +Create an archive with symtabs to analyse on other machine: perf archive +Search options using a keyword: perf report -h +Use parent filter to see specific call path: perf report -p +List events using substring match: perf list +To see list of saved events and attributes: perf evlist -v +Use --symfs if your symbol files are in non-standard 
locations +To see callchains in a more compact form: perf report -g folded diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 1025ea7..0a22407 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -567,6 +567,9 @@ endif $(call QUIET_INSTALL, perf_completion-script) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \ $(INSTALL) perf-completion.sh '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf' + $(call QUIET_INSTALL, perf-tip) \ + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(tip_instdir_SQ)'; \ + $(INSTALL) Documentation/tips.txt -t '$(DESTDIR_SQ)$(tip_instdir_SQ)' install-tests: all install-gtk $(call QUIET_INSTALL, tests) \ diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f10c663..d5a42ee 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -433,7 +433,7 @@ static int report__browse_hists(struct report *rep) int ret; struct perf_session *session = rep->session; struct perf_evlist *evlist = session->evlist; - const char *help = "For a higher level overview, try: perf report --sort comm,dso"; + const char *help = perf_tip(TIPDIR); switch (use_browser) { case 1: diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 18b2f96..254d06e 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -691,6 +691,7 @@ sharedir = $(prefix)/share template_dir = share/perf-core/templates STRACE_GROUPS_DIR = share/perf-core/strace/groups htmldir = share/doc/perf-doc +tipdir = share/doc/perf-tip ifeq ($(prefix),/usr) sysconfdir = /etc ETC_PERFCONFIG = $(sysconfdir)/perfconfig @@ -717,6 +718,7 @@ infodir_SQ = $(subst ','\'',$(infodir)) perfexecdir_SQ = $(subst ','\'',$(perfexecdir)) template_dir_SQ = $(subst ','\'',$(template_dir)) htmldir_SQ = $(subst ','\'',$(htmldir)) +tipdir_SQ = $(subst ','\'',$(tipdir)) prefix_SQ = $(subst ','\'',$(prefix)) sysconfdir_SQ = $(subst ','\'',$(sysconfdir)) libdir_SQ = $(subst ','\'',$(libdir)) @@ -724,12 +726,15 @@ libdir_SQ = $(subst ','\'',$(libdir)) ifneq ($(filter /%,$(firstword $(perfexecdir))),) perfexec_instdir = $(perfexecdir) STRACE_GROUPS_INSTDIR = $(STRACE_GROUPS_DIR) +tip_instdir = $(tipdir) else perfexec_instdir = $(prefix)/$(perfexecdir) STRACE_GROUPS_INSTDIR = $(prefix)/$(STRACE_GROUPS_DIR) +tip_instdir = $(prefix)/$(tipdir) endif perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) STRACE_GROUPS_INSTDIR_SQ = $(subst ','\'',$(STRACE_GROUPS_INSTDIR)) +tip_instdir_SQ = $(subst ','\'',$(tip_instdir)) # If we install to $(HOME) we keep the traceevent default: # $(HOME)/.traceevent/plugins @@ -770,6 +775,7 @@ $(call detected_var,ETC_PERFCONFIG_SQ) $(call detected_var,STRACE_GROUPS_DIR_SQ) $(call detected_var,prefix_SQ) $(call detected_var,perfexecdir_SQ) +$(call detected_var,tipdir_SQ) $(call detected_var,LIBDIR) $(call detected_var,GTK_CFLAGS) $(call detected_var,PERL_EMBED_CCOPTS) diff --git a/tools/perf/perf.c b/tools/perf/perf.c index cb1d249..a929618 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -19,6 +19,8 @@ #include "util/debug.h" #include #include +#include +#include const char perf_usage_string[] = "perf [--version] [--help] [OPTIONS] COMMAND [ARGS]"; @@ -542,6 +544,8 @@ int main(int argc, const char **argv) if (!cmd) cmd = "perf-help"; + srandom(time(NULL)); + /* get debugfs/tracefs mount point from /proc/mounts */ tracing_path_mount(); diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index aff0cfd..88b8f8d 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ 
-16,6 +16,8 @@ #include #include #include "callchain.h" +#include "strlist.h" +#include struct callchain_param callchain_param = { .mode = CHAIN_GRAPH_ABS, @@ -663,3 +665,28 @@ fetch_kernel_version(unsigned int *puint, char *str, *puint = (version << 16) + (patchlevel << 8) + sublevel; return 0; } + +const char *perf_tip(const char *dirpath) +{ + struct strlist *tips; + struct str_node *node; + char *tip = NULL; + struct strlist_config conf = { + .dirname = system_path(dirpath) , + }; + + tips = strlist__new("tips.txt", &conf); + if (tips == NULL || strlist__nr_entries(tips) == 1) { + tip = (char *)"Cannot find tips.txt file"; + goto out; + } + + node = strlist__entry(tips, random() % strlist__nr_entries(tips)); + if (asprintf(&tip, "Tip: %s", node->s) < 0) + tip = (char *)"Tip: get more memory! ;-)"; + +out: + strlist__delete(tips); + + return tip; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 4b519c5..fe915e6 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -342,4 +342,6 @@ int fetch_kernel_version(unsigned int *puint, #define KVER_FMT "%d.%d.%d" #define KVER_PARAM(x) KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x) +const char *perf_tip(const char *dirpath); + #endif /* GIT_COMPAT_UTIL_H */ -- cgit v1.1 From 23df7f798435796aff07d641456326b81cb34a77 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 7 Jan 2016 10:13:59 +0100 Subject: perf evlist: Make perf_evlist__open() open evsels with their cpus and threads (like perf record does) 'perf record' uses perf_evsel__open() to open events and passes evsel->cpus and evsel->threads. Many tests and some tools instead use perf_evlist__open(), which passes evlist->cpus and evlist->threads. Make perf_evlist__open() follow the 'perf record' behaviour so that a consistent approach is taken. Signed-off-by: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index fa6dbf0..29e085b2 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1498,7 +1498,7 @@ int perf_evlist__open(struct perf_evlist *evlist) perf_evlist__update_id_pos(evlist); evlist__for_each(evlist, evsel) { - err = perf_evsel__open(evsel, evlist->cpus, evlist->threads); + err = perf_evsel__open(evsel, evsel->cpus, evsel->threads); if (err < 0) goto out_err; } -- cgit v1.1 From d2190a8091124f832c8862ace3a3d7d70a2506a5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:13:58 +0100 Subject: perf evlist: Remove perf_evlist__(enable|disable)_event functions Replace them with perf_evsel__(enable|disable).
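For context, the per-evsel helpers follow the same shape as the evlist-level loops deleted below, minus the evlist indirection. A minimal sketch, modeled on the removed code rather than on the exact util/evsel.c implementation (the perf_evsel__ioc() wrapper name is invented; FD(), cpu_map__nr() and thread_map__nr() are the accessors perf uses elsewhere):

/* Sketch: issue one ioctl per open event fd owned by this evsel. */
static int perf_evsel__ioc(struct perf_evsel *evsel, unsigned long ioc)
{
	int cpu, thread, err;

	if (!evsel->fd)
		return -EINVAL;

	for (cpu = 0; cpu < cpu_map__nr(evsel->cpus); cpu++) {
		for (thread = 0; thread < thread_map__nr(evsel->threads); thread++) {
			err = ioctl(FD(evsel, cpu, thread), ioc, 0);
			if (err)
				return err;
		}
	}
	return 0;
}

int perf_evsel__enable(struct perf_evsel *evsel)
{
	return perf_evsel__ioc(evsel, PERF_EVENT_IOC_ENABLE);
}

int perf_evsel__disable(struct perf_evsel *evsel)
{
	return perf_evsel__ioc(evsel, PERF_EVENT_IOC_DISABLE);
}

Note that the evsel variants iterate evsel->cpus and evsel->threads, which matches the perf_evlist__open() change above.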
Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Noel Grandin Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/x86/util/intel-bts.c | 4 ++-- tools/perf/arch/x86/util/intel-pt.c | 4 ++-- tools/perf/tests/keep-tracking.c | 2 +- tools/perf/tests/switch-tracking.c | 6 +++--- tools/perf/util/evlist.c | 42 ------------------------------------ tools/perf/util/evlist.h | 4 ---- 6 files changed, 8 insertions(+), 54 deletions(-) diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 9b94ce5..8d8150f 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -327,7 +327,7 @@ static int intel_bts_snapshot_start(struct auxtrace_record *itr) evlist__for_each(btsr->evlist, evsel) { if (evsel->attr.type == btsr->intel_bts_pmu->type) - return perf_evlist__disable_event(btsr->evlist, evsel); + return perf_evsel__disable(evsel); } return -EINVAL; } @@ -340,7 +340,7 @@ static int intel_bts_snapshot_finish(struct auxtrace_record *itr) evlist__for_each(btsr->evlist, evsel) { if (evsel->attr.type == btsr->intel_bts_pmu->type) - return perf_evlist__enable_event(btsr->evlist, evsel); + return perf_evsel__enable(evsel); } return -EINVAL; } diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index b64d462..f05daac 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -725,7 +725,7 @@ static int intel_pt_snapshot_start(struct auxtrace_record *itr) evlist__for_each(ptr->evlist, evsel) { if (evsel->attr.type == ptr->intel_pt_pmu->type) - return perf_evlist__disable_event(ptr->evlist, evsel); + return perf_evsel__disable(evsel); } return -EINVAL; } @@ -738,7 +738,7 @@ static int intel_pt_snapshot_finish(struct auxtrace_record *itr) evlist__for_each(ptr->evlist, evsel) { if (evsel->attr.type == ptr->intel_pt_pmu->type) - return perf_evlist__enable_event(ptr->evlist, evsel); + return perf_evsel__enable(evsel); } return -EINVAL; } diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index 6158132..ddb78fa 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -123,7 +123,7 @@ int test__keep_tracking(int subtest __maybe_unused) evsel = perf_evlist__last(evlist); - CHECK__(perf_evlist__disable_event(evlist, evsel)); + CHECK__(perf_evsel__disable(evsel)); comm = "Test COMM 2"; CHECK__(prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0)); diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index dfbd8d6..ebd8016 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -455,7 +455,7 @@ int test__switch_tracking(int subtest __maybe_unused) perf_evlist__enable(evlist); - err = perf_evlist__disable_event(evlist, cpu_clocks_evsel); + err = perf_evsel__disable(cpu_clocks_evsel); if (err) { pr_debug("perf_evlist__disable_event failed!\n"); goto out_err; @@ -474,7 +474,7 @@ int test__switch_tracking(int subtest __maybe_unused) goto out_err; } - err = perf_evlist__disable_event(evlist, cycles_evsel); + err = perf_evsel__disable(cycles_evsel); if (err) { pr_debug("perf_evlist__disable_event failed!\n"); goto out_err; @@ -500,7 +500,7 @@ int test__switch_tracking(int subtest __maybe_unused) goto out_err; } - err = perf_evlist__enable_event(evlist, cycles_evsel); + err = perf_evsel__enable(cycles_evsel); if (err) { 
pr_debug("perf_evlist__disable_event failed!\n"); goto out_err; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 29e085b2..d81f13d 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -393,48 +393,6 @@ void perf_evlist__toggle_enable(struct perf_evlist *evlist) (evlist->enabled ? perf_evlist__disable : perf_evlist__enable)(evlist); } -int perf_evlist__disable_event(struct perf_evlist *evlist, - struct perf_evsel *evsel) -{ - int cpu, thread, err; - int nr_cpus = cpu_map__nr(evlist->cpus); - int nr_threads = perf_evlist__nr_threads(evlist, evsel); - - if (!evsel->fd) - return 0; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - for (thread = 0; thread < nr_threads; thread++) { - err = ioctl(FD(evsel, cpu, thread), - PERF_EVENT_IOC_DISABLE, 0); - if (err) - return err; - } - } - return 0; -} - -int perf_evlist__enable_event(struct perf_evlist *evlist, - struct perf_evsel *evsel) -{ - int cpu, thread, err; - int nr_cpus = cpu_map__nr(evlist->cpus); - int nr_threads = perf_evlist__nr_threads(evlist, evsel); - - if (!evsel->fd) - return -EINVAL; - - for (cpu = 0; cpu < nr_cpus; cpu++) { - for (thread = 0; thread < nr_threads; thread++) { - err = ioctl(FD(evsel, cpu, thread), - PERF_EVENT_IOC_ENABLE, 0); - if (err) - return err; - } - } - return 0; -} - static int perf_evlist__enable_event_cpu(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu) { diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 3b7e1e2..7c4d9a2 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -155,10 +155,6 @@ void perf_evlist__disable(struct perf_evlist *evlist); void perf_evlist__enable(struct perf_evlist *evlist); void perf_evlist__toggle_enable(struct perf_evlist *evlist); -int perf_evlist__disable_event(struct perf_evlist *evlist, - struct perf_evsel *evsel); -int perf_evlist__enable_event(struct perf_evlist *evlist, - struct perf_evsel *evsel); int perf_evlist__enable_event_idx(struct perf_evlist *evlist, struct perf_evsel *evsel, int idx); -- cgit v1.1 From f22ed827a8d5ff5a85e7c8e865baaaaf71a8d0cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:00 +0100 Subject: perf unwind: Use find_map function in access_dso_mem The find_map helper is already there, so let's use it. Also we're going to introduce wider search in following patch, so it'll be easier to make this change on single place. 
Signed-off-by: Jiri Olsa Tested-by: Noel Grandin Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libunwind.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index 3c258a0..f37859c 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -416,20 +416,19 @@ get_proc_name(unw_addr_space_t __maybe_unused as, static int access_dso_mem(struct unwind_info *ui, unw_word_t addr, unw_word_t *data) { - struct addr_location al; + struct map *map; ssize_t size; - thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER, - MAP__FUNCTION, addr, &al); - if (!al.map) { + map = find_map(addr, ui); + if (!map) { pr_debug("unwind: no map for %lx\n", (unsigned long)addr); return -1; } - if (!al.map->dso) + if (!map->dso) return -1; - size = dso__data_read_addr(al.map->dso, al.map, ui->machine, + size = dso__data_read_addr(map->dso, map, ui->machine, addr, (u8 *) data, sizeof(*data)); return !(size == sizeof(*data)); -- cgit v1.1 From 0ddf5246f70ecc04e1bb4c4dc2be65977d1c03a7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:01 +0100 Subject: perf unwind: Check for mmaps also in MAP__VARIABLE tree We've seen cases (softice) where the DWARF unwinder went through non-executable mmaps, which we need to look up in the MAP__VARIABLE tree. Reported-and-Tested-by: Noel Grandin Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libunwind.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index f37859c..ee7e372 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -319,6 +319,15 @@ static struct map *find_map(unw_word_t ip, struct unwind_info *ui) thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER, MAP__FUNCTION, ip, &al); + if (!al.map) { + /* + * We've seen cases (softice) where DWARF unwinder went + * through non executable mmaps, which we need to lookup + * in MAP__VARIABLE tree. + */ + thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER, + MAP__VARIABLE, ip, &al); + } return al.map; } -- cgit v1.1 From 0ba98149f8c8b6b2ba36be9938afb731fa719004 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 10:14:02 +0100 Subject: perf libdw: Check for mmaps also in MAP__VARIABLE tree We've seen cases (softice) where the DWARF unwinder went through non-executable mmaps, which we need to look up in the MAP__VARIABLE tree.
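For orientation: the libdw unwinder reaches its access_dso_mem() through elfutils' Dwfl_Thread_Callbacks, so this map lookup runs for every memory read the unwinder issues outside the sampled stack. A condensed sketch of the wiring; the callback type and the memory_read() signature are elfutils' public API, while the perf-side bodies are simplified here and the two stack helpers are invented names:

#include <elfutils/libdwfl.h>

/* dwfl_getthread_frames() drives the unwind and calls memory_read()
 * for every load it needs; reads outside the sampled stack snapshot
 * are served from the backing DSO via access_dso_mem(). */
static bool memory_read(Dwfl *dwfl, Dwarf_Addr addr, Dwarf_Word *result,
			void *arg)
{
	struct unwind_info *ui = arg;

	if (addr_in_sampled_stack(ui, addr))		/* invented helper */
		return read_stack_word(ui, addr, result);	/* invented helper */

	return access_dso_mem(ui, addr, result) == 0;
}

static const Dwfl_Thread_Callbacks callbacks = {
	.next_thread		= next_thread,
	.memory_read		= memory_read,
	.set_initial_registers	= set_initial_registers,
};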
Reported-and-Tested-by: Noel Grandin Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1452158050-28061-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/unwind-libdw.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c index db8142b..cf5e250 100644 --- a/tools/perf/util/unwind-libdw.c +++ b/tools/perf/util/unwind-libdw.c @@ -96,6 +96,16 @@ static int access_dso_mem(struct unwind_info *ui, Dwarf_Addr addr, thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER, MAP__FUNCTION, addr, &al); if (!al.map) { + /* + * We've seen cases (softice) where DWARF unwinder went + * through non executable mmaps, which we need to lookup + * in MAP__VARIABLE tree. + */ + thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER, + MAP__VARIABLE, addr, &al); + } + + if (!al.map) { pr_debug("unwind: no map for %lx\n", (unsigned long)addr); return -1; } -- cgit v1.1 From 5c0cf22477eaa890beeb4bc3554e5bebbea4b007 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jan 2016 14:30:22 +0100 Subject: perf record: Store data mmaps for dwarf unwind Currently we don't synthesize data mmaps by default; that depends on the -d option, which enables data address sampling. But we've seen cases (softice) where the DWARF unwinder went through non-executable mmaps, which we need to look up in the MAP__VARIABLE tree. Make data mmaps be synthesized for DWARF unwind as well. Reported-by: Noel Grandin Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160107133022.GA32115@krava.brq.redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-record.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 9c5cdc2c4..dc4e0ad 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -815,8 +815,12 @@ int record_parse_callchain_opt(const struct option *opt, } ret = parse_callchain_record_opt(arg, &callchain_param); - if (!ret) + if (!ret) { + /* Enable data address sampling for DWARF unwind. */ + if (callchain_param.record_mode == CALLCHAIN_DWARF) + record->sample_address = true; callchain_debug(); + } return ret; } -- cgit v1.1 From 775d8a1b0d75211cc6123915c6b5b688f2002478 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 7 Jan 2016 09:12:29 +0900 Subject: perf evlist: Add --trace-fields option to show trace fields To use dynamic sort keys, it might be good to add an option to see the list of field names.
$ perf evlist -i perf.data.sched sched:sched_switch sched:sched_stat_wait sched:sched_stat_sleep sched:sched_stat_iowait sched:sched_stat_runtime sched:sched_process_fork sched:sched_wakeup sched:sched_wakeup_new sched:sched_migrate_task # Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events $ perf evlist -i perf.data.sched --trace-fields sched:sched_switch: trace_fields: prev_comm,prev_pid,prev_prio,prev_state,next_comm,next_pid,next_prio sched:sched_stat_wait: trace_fields: comm,pid,delay sched:sched_stat_sleep: trace_fields: comm,pid,delay sched:sched_stat_iowait: trace_fields: comm,pid,delay sched:sched_stat_runtime: trace_fields: comm,pid,runtime,vruntime sched:sched_process_fork: trace_fields: parent_comm,parent_pid,child_comm,child_pid sched:sched_wakeup: trace_fields: comm,pid,prio,success,target_cpu sched:sched_wakeup_new: trace_fields: comm,pid,prio,success,target_cpu sched:sched_migrate_task: trace_fields: comm,pid,prio,orig_cpu,dest_cpu Committer notes: For another file, in verbose mode: # perf evlist -v --trace-fields sched:sched_switch: type: 2, size: 112, config: 0x10b, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|PERIOD|RAW, disabled: 1, inherit: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, trace_fields: prev_comm,prev_pid,prev_prio,prev_state,next_comm,next_pid,next_prio # Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/1452125549-1511-5-git-send-email-namhyung@kernel.org [ Replaced 'trace_fields=' with 'trace_fields: ' to make the output consistent in -v mode ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-evlist.txt | 3 +++ tools/perf/builtin-evlist.c | 11 ++++++++++- tools/perf/util/evsel.c | 23 +++++++++++++++++++++++ tools/perf/util/evsel.h | 1 + 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt index 1ceb370..6f7200f 100644 --- a/tools/perf/Documentation/perf-evlist.txt +++ b/tools/perf/Documentation/perf-evlist.txt @@ -32,6 +32,9 @@ OPTIONS --group:: Show event group information. +--trace-fields:: + Show tracepoint field names. 
+ SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-list[1], diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 08a7d36..8a31f51 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -26,14 +26,22 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details .mode = PERF_DATA_MODE_READ, .force = details->force, }; + bool has_tracepoint = false; session = perf_session__new(&file, 0, NULL); if (session == NULL) return -1; - evlist__for_each(session->evlist, pos) + evlist__for_each(session->evlist, pos) { perf_evsel__fprintf(pos, details, stdout); + if (pos->attr.type == PERF_TYPE_TRACEPOINT) + has_tracepoint = true; + } + + if (has_tracepoint && !details->trace_fields) + printf("# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events\n"); + perf_session__delete(session); return 0; } @@ -49,6 +57,7 @@ int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN('g', "group", &details.event_group, "Show event group information"), OPT_BOOLEAN('f', "force", &details.force, "don't complain, do it"), + OPT_BOOLEAN(0, "trace-fields", &details.trace_fields, "Show tracepoint fields"), OPT_END() }; const char * const evlist_usage[] = { diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 544e440..cdbaf9b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2298,6 +2298,29 @@ int perf_evsel__fprintf(struct perf_evsel *evsel, printed += comma_fprintf(fp, &first, " %s=%" PRIu64, term, (u64)evsel->attr.sample_freq); } + + if (details->trace_fields) { + struct format_field *field; + + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { + printed += comma_fprintf(fp, &first, " (not a tracepoint)"); + goto out; + } + + field = evsel->tp_format->format.fields; + if (field == NULL) { + printed += comma_fprintf(fp, &first, " (no trace field)"); + goto out; + } + + printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); + + field = field->next; + while (field) { + printed += comma_fprintf(fp, &first, "%s", field->name); + field = field->next; + } + } out: fputc('\n', fp); return ++printed; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 5ded1fc..8e75434 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -369,6 +369,7 @@ struct perf_attr_details { bool verbose; bool event_group; bool force; + bool trace_fields; }; int perf_evsel__fprintf(struct perf_evsel *evsel, -- cgit v1.1 From 1f1a89ac05f6e88aa341e86e57435fdbb1177c0c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 8 Jan 2016 09:55:33 +0000 Subject: x86/mm: Micro-optimise clflush_cache_range() Whilst inspecting the asm for clflush_cache_range() and some perf profiles that required extensive flushing of single cachelines (from part of the intel-gpu-tools GPU benchmarks), we noticed that gcc was reloading boot_cpu_data.x86_clflush_size on every iteration of the loop. We can manually hoist that read which perf regarded as taking ~25% of the function time for a single cacheline flush. Signed-off-by: Chris Wilson Reviewed-by: Ross Zwisler Acked-by: "H. Peter Anvin" Cc: Toshi Kani Cc: Borislav Petkov Cc: Luis R. 
Rodriguez Cc: Stephen Rothwell Cc: Sai Praneeth Link: http://lkml.kernel.org/r/1452246933-10890-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4..6000ad7 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -129,14 +129,16 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; - void *p; + + if (p >= vend) + return; mb(); - for (p = (void *)((unsigned long)vaddr & ~clflush_mask); - p < vend; p += boot_cpu_data.x86_clflush_size) + for (; p < vend; p += clflush_size) clflushopt(p); mb(); -- cgit v1.1 From b43417216e9ce55e1f1ab7c834c7ab43db0b53e1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 5 Jan 2016 18:27:29 +0100 Subject: compat_ioctl: don't look up the fd twice In code in fs/compat_ioctl.c that translates ioctl arguments into an in-kernel structure, then performs sys_ioctl, possibly under set_fs(KERNEL_DS), this commit changes the sys_ioctl calls to do_ioctl calls. do_ioctl is a new function that does the same thing as sys_ioctl, but doesn't look up the fd again. This change is made to avoid (potential) security issues because of ioctl handlers that accept one of the ioctl commands I2C_FUNCS, VIDEO_GET_EVENT, MTIOCPOS, MTIOCGET, TIOCGSERIAL, TIOCSSERIAL, RTC_IRQP_READ, RTC_EPOCH_READ. This can happen for multiple reasons: - The ioctl command number could be reused. - The ioctl handler might not check the full ioctl command. This is e.g. true for drm_ioctl. - The ioctl handler is very special, e.g. cuse_file_ioctl. The real issue is that set_fs(KERNEL_DS) is used here, but that's fixed in a separate commit "compat_ioctl: don't call do_ioctl under set_fs(KERNEL_DS)". This change mitigates potential security issues by preventing a race that permits invocation of unlocked_ioctl handlers under KERNEL_DS through compat code even if a corresponding compat_ioctl handler exists. So far, no way has been identified to use this to damage kernel memory without having CAP_SYS_ADMIN in the init ns (with the capability, doing reads/writes at arbitrary kernel addresses should be easy through CUSE's ioctl handler with FUSE_IOCTL_UNRESTRICTED set).
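To make the race concrete: a descriptor number is only a name, and another thread can rebind it between two lookups. A minimal userspace illustration of the attacker's side (hypothetical and standalone; /dev/null and /dev/zero merely stand in for two files with different handlers, and the window being closed sits inside the kernel's compat path, so this cannot be triggered deterministically from here):

#include <fcntl.h>
#include <pthread.h>
#include <sys/ioctl.h>
#include <unistd.h>

static int fd;

static void *rebind(void *arg)
{
	int other = open("/dev/zero", O_RDWR);

	/* After this, fd names a different struct file: a second
	 * in-kernel fd lookup would resolve to /dev/zero's handler. */
	dup2(other, fd);
	return NULL;
}

int main(void)
{
	pthread_t t;

	fd = open("/dev/null", O_RDWR);
	pthread_create(&t, NULL, rebind, NULL);
	/* If the compat layer chose a translation handler from lookup #1
	 * and a nested sys_ioctl() performed lookup #2, the two lookups
	 * could disagree about which file is being operated on. */
	ioctl(fd, TCGETS, 0);
	pthread_join(t, NULL);
	return 0;
}

Passing the already-resolved struct file down to do_ioctl(), as the diff below does, removes the second lookup entirely.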
[AV: two missed sys_ioctl() taken care of] Signed-off-by: Jann Horn Signed-off-by: Al Viro --- fs/compat_ioctl.c | 122 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dcf2653..06e60ca 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -115,15 +115,27 @@ #include #endif -static int w_long(unsigned int fd, unsigned int cmd, - compat_ulong_t __user *argp) +static int do_ioctl(struct file *file, unsigned int fd, + unsigned int cmd, unsigned long arg) +{ + int err; + + err = security_file_ioctl(file, cmd, arg); + if (err) + return err; + + return do_vfs_ioctl(file, fd, cmd, arg); +} + +static int w_long(struct file *file, unsigned int fd, + unsigned int cmd, compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); int err; unsigned long val; set_fs (KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&val); + err = do_ioctl(file, fd, cmd, (unsigned long)&val); set_fs (old_fs); if (!err && put_user(val, argp)) return -EFAULT; @@ -139,15 +151,15 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, - struct compat_video_event __user *up) +static int do_video_get_event(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_video_event __user *up) { struct video_event kevent; mm_segment_t old_fs = get_fs(); int err; set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long) &kevent); + err = do_ioctl(file, fd, cmd, (unsigned long) &kevent); set_fs(old_fs); if (!err) { @@ -169,8 +181,8 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, - struct compat_video_still_picture __user *up) +static int do_video_stillpicture(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; compat_uptr_t fp; @@ -190,7 +202,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, fd, cmd, (unsigned long) up_native); return err; } @@ -200,8 +212,8 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, - struct compat_video_spu_palette __user *up) +static int do_video_set_spu_palette(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; compat_uptr_t palp; @@ -218,7 +230,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, fd, cmd, (unsigned long) up_native); return err; } @@ -276,7 +288,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, +static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; @@ -289,7 +301,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (get_user(interface_id, &sgio32->interface_id)) return -EFAULT; if (interface_id != 'S') - return sys_ioctl(fd, cmd, (unsigned long)sgio32); + return do_ioctl(file, fd, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -349,7 +361,7 @@ static int 
sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) sgio); + err = do_ioctl(file, fd, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; @@ -380,13 +392,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct - compat_sg_req_info __user *o) +static int sg_grt_trans(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = sys_ioctl(fd,cmd,(unsigned long)r); + err = do_ioctl(file, fd, cmd, (unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { @@ -412,8 +424,8 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, - struct sock_fprog32 __user *u_fprog32) +static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int fd, + unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; @@ -435,7 +447,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, else cmd = PPPIOCSACTIVE; - return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); + return do_ioctl(file, fd, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { @@ -451,7 +463,7 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, +static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; @@ -460,7 +472,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, idle = compat_alloc_user_space(sizeof(*idle)); - err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + err = do_ioctl(file, fd, PPPIOCGIDLE, (unsigned long) idle); if (!err) { if (get_user(xmit, &idle->xmit_idle) || @@ -472,7 +484,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, +static int ppp_scompress(struct file *file, unsigned int fd, unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; @@ -492,7 +504,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, sizeof(__u32) + sizeof(int))) return -EFAULT; - return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); + return do_ioctl(file, fd, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK @@ -512,7 +524,8 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +static int mt_ioctl_trans(struct file *file, unsigned int fd, + unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); struct mtget get; @@ -534,7 +547,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) break; } set_fs (KERNEL_DS); - err = sys_ioctl (fd, kcmd, (unsigned long)karg); + err = do_ioctl(file, fd, kcmd, (unsigned long)karg); set_fs (old_fs); if (err) return err; @@ -605,8 +618,8 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, - struct 
serial_struct32 __user *ss32) +static int serial_struct_ioctl(struct file *file, unsigned fd, + unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; int err; @@ -629,7 +642,7 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, ss.iomap_base = 0UL; } set_fs(KERNEL_DS); - err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); + err = do_ioctl(file, fd, cmd, (unsigned long)&ss); set_fs(oldseg); if (cmd == TIOCGSERIAL && err >= 0) { if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) @@ -674,8 +687,8 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_rdwr_ioctl_data32 __user *udata) +static int do_i2c_rdwr_ioctl(struct file *file, unsigned int fd, + unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; @@ -708,11 +721,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, fd, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_smbus_ioctl_data32 __user *udata) +static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, + unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; compat_caddr_t datap; @@ -734,7 +747,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, fd, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) @@ -742,7 +755,8 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +static int rtc_ioctl(struct file *file, unsigned fd, + unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); compat_ulong_t val32; @@ -753,7 +767,7 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) case RTC_IRQP_READ32: case RTC_EPOCH_READ32: set_fs(KERNEL_DS); - ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + ret = do_ioctl(file, fd, (cmd == RTC_IRQP_READ32) ? 
RTC_IRQP_READ : RTC_EPOCH_READ, (unsigned long)&kval); set_fs(oldfs); @@ -762,9 +776,9 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) val32 = kval; return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + return do_ioctl(file, fd, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + return do_ioctl(file, fd, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; @@ -1443,46 +1457,46 @@ static long do_ioctl_trans(int fd, unsigned int cmd, switch (cmd) { case PPPIOCGIDLE32: - return ppp_gidle(fd, cmd, argp); + return ppp_gidle(file, fd, cmd, argp); case PPPIOCSCOMPRESS32: - return ppp_scompress(fd, cmd, argp); + return ppp_scompress(file, fd, cmd, argp); case PPPIOCSPASS32: case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); + return ppp_sock_fprog_ioctl_trans(file, fd, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: - return sg_ioctl_trans(fd, cmd, argp); + return sg_ioctl_trans(file, fd, cmd, argp); case SG_GET_REQUEST_TABLE: - return sg_grt_trans(fd, cmd, argp); + return sg_grt_trans(file, fd, cmd, argp); case MTIOCGET32: case MTIOCPOS32: - return mt_ioctl_trans(fd, cmd, argp); + return mt_ioctl_trans(file, fd, cmd, argp); #endif /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: - return serial_struct_ioctl(fd, cmd, argp); + return serial_struct_ioctl(file, fd, cmd, argp); /* i2c */ case I2C_FUNCS: - return w_long(fd, cmd, argp); + return w_long(file, fd, cmd, argp); case I2C_RDWR: - return do_i2c_rdwr_ioctl(fd, cmd, argp); + return do_i2c_rdwr_ioctl(file, fd, cmd, argp); case I2C_SMBUS: - return do_i2c_smbus_ioctl(fd, cmd, argp); + return do_i2c_smbus_ioctl(file, fd, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: - return rtc_ioctl(fd, cmd, argp); + return rtc_ioctl(file, fd, cmd, argp); /* dvb */ case VIDEO_GET_EVENT: - return do_video_get_event(fd, cmd, argp); + return do_video_get_event(file, fd, cmd, argp); case VIDEO_STILLPICTURE: - return do_video_stillpicture(fd, cmd, argp); + return do_video_stillpicture(file, fd, cmd, argp); case VIDEO_SET_SPU_PALETTE: - return do_video_set_spu_palette(fd, cmd, argp); + return do_video_set_spu_palette(file, fd, cmd, argp); } /* -- cgit v1.1 From 66cf191f3eae4582a83cb4251b75b43bee95a999 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jan 2016 09:53:30 -0500 Subject: compat_ioctl: don't pass fd around when not needed Signed-off-by: Al Viro --- fs/compat_ioctl.c | 103 +++++++++++++++++++++++++++-------------------------- fs/internal.h | 7 ++++ fs/ioctl.c | 4 +-- include/linux/fs.h | 2 -- 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 06e60ca..908837c 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,6 +58,8 @@ #include #include +#include "internal.h" + #include #include #include @@ -115,8 +117,7 @@ #include #endif -static int do_ioctl(struct file *file, unsigned int fd, - unsigned int cmd, unsigned long arg) +static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int err; @@ -124,10 +125,10 @@ static int do_ioctl(struct file *file, unsigned int fd, if (err) return err; - return do_vfs_ioctl(file, fd, cmd, arg); + return vfs_ioctl(file, cmd, arg); } -static int w_long(struct file *file, unsigned int fd, +static int w_long(struct file *file, unsigned int cmd, 
compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); @@ -135,7 +136,7 @@ static int w_long(struct file *file, unsigned int fd, unsigned long val; set_fs (KERNEL_DS); - err = do_ioctl(file, fd, cmd, (unsigned long)&val); + err = do_ioctl(file, cmd, (unsigned long)&val); set_fs (old_fs); if (!err && put_user(val, argp)) return -EFAULT; @@ -151,7 +152,7 @@ struct compat_video_event { } u; }; -static int do_video_get_event(struct file *file, unsigned int fd, +static int do_video_get_event(struct file *file, unsigned int cmd, struct compat_video_event __user *up) { struct video_event kevent; @@ -159,7 +160,7 @@ static int do_video_get_event(struct file *file, unsigned int fd, int err; set_fs(KERNEL_DS); - err = do_ioctl(file, fd, cmd, (unsigned long) &kevent); + err = do_ioctl(file, cmd, (unsigned long) &kevent); set_fs(old_fs); if (!err) { @@ -181,7 +182,7 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(struct file *file, unsigned int fd, +static int do_video_stillpicture(struct file *file, unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; @@ -202,7 +203,7 @@ static int do_video_stillpicture(struct file *file, unsigned int fd, if (err) return -EFAULT; - err = do_ioctl(file, fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -212,7 +213,7 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(struct file *file, unsigned int fd, +static int do_video_set_spu_palette(struct file *file, unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; @@ -230,7 +231,7 @@ static int do_video_set_spu_palette(struct file *file, unsigned int fd, if (err) return -EFAULT; - err = do_ioctl(file, fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -288,7 +289,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, +static int sg_ioctl_trans(struct file *file, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; @@ -301,7 +302,7 @@ static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, if (get_user(interface_id, &sgio32->interface_id)) return -EFAULT; if (interface_id != 'S') - return do_ioctl(file, fd, cmd, (unsigned long)sgio32); + return do_ioctl(file, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -361,7 +362,7 @@ static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - err = do_ioctl(file, fd, cmd, (unsigned long) sgio); + err = do_ioctl(file, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; @@ -392,13 +393,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(struct file *file, unsigned int fd, +static int sg_grt_trans(struct file *file, unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = do_ioctl(file, fd, cmd, (unsigned long)r); + err = do_ioctl(file, cmd, (unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { @@ -424,7 +425,7 @@ struct sock_fprog32 { #define 
PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int fd, +static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); @@ -447,7 +448,7 @@ static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int fd, else cmd = PPPIOCSACTIVE; - return do_ioctl(file, fd, cmd, (unsigned long) u_fprog64); + return do_ioctl(file, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { @@ -463,7 +464,7 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, +static int ppp_gidle(struct file *file, unsigned int cmd, struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; @@ -472,7 +473,7 @@ static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, idle = compat_alloc_user_space(sizeof(*idle)); - err = do_ioctl(file, fd, PPPIOCGIDLE, (unsigned long) idle); + err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle); if (!err) { if (get_user(xmit, &idle->xmit_idle) || @@ -484,7 +485,7 @@ static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, return err; } -static int ppp_scompress(struct file *file, unsigned int fd, unsigned int cmd, +static int ppp_scompress(struct file *file, unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; @@ -504,7 +505,7 @@ static int ppp_scompress(struct file *file, unsigned int fd, unsigned int cmd, sizeof(__u32) + sizeof(int))) return -EFAULT; - return do_ioctl(file, fd, PPPIOCSCOMPRESS, (unsigned long) odata); + return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK @@ -524,7 +525,7 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(struct file *file, unsigned int fd, +static int mt_ioctl_trans(struct file *file, unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); @@ -547,7 +548,7 @@ static int mt_ioctl_trans(struct file *file, unsigned int fd, break; } set_fs (KERNEL_DS); - err = do_ioctl(file, fd, kcmd, (unsigned long)karg); + err = do_ioctl(file, kcmd, (unsigned long)karg); set_fs (old_fs); if (err) return err; @@ -618,7 +619,7 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(struct file *file, unsigned fd, +static int serial_struct_ioctl(struct file *file, unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; @@ -642,7 +643,7 @@ static int serial_struct_ioctl(struct file *file, unsigned fd, ss.iomap_base = 0UL; } set_fs(KERNEL_DS); - err = do_ioctl(file, fd, cmd, (unsigned long)&ss); + err = do_ioctl(file, cmd, (unsigned long)&ss); set_fs(oldseg); if (cmd == TIOCGSERIAL && err >= 0) { if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) @@ -687,7 +688,7 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(struct file *file, unsigned int fd, +static int do_i2c_rdwr_ioctl(struct file *file, unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; @@ -721,10 +722,10 @@ static int do_i2c_rdwr_ioctl(struct file *file, unsigned int fd, put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } - return do_ioctl(file, fd, cmd, (unsigned long)tdata); + return 
do_ioctl(file, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, +static int do_i2c_smbus_ioctl(struct file *file, unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; @@ -747,7 +748,7 @@ static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; - return do_ioctl(file, fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) @@ -755,7 +756,7 @@ static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(struct file *file, unsigned fd, +static int rtc_ioctl(struct file *file, unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); @@ -767,7 +768,7 @@ static int rtc_ioctl(struct file *file, unsigned fd, case RTC_IRQP_READ32: case RTC_EPOCH_READ32: set_fs(KERNEL_DS); - ret = do_ioctl(file, fd, (cmd == RTC_IRQP_READ32) ? + ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? RTC_IRQP_READ : RTC_EPOCH_READ, (unsigned long)&kval); set_fs(oldfs); @@ -776,9 +777,9 @@ static int rtc_ioctl(struct file *file, unsigned fd, val32 = kval; return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return do_ioctl(file, fd, RTC_IRQP_SET, (unsigned long)argp); + return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return do_ioctl(file, fd, RTC_EPOCH_SET, (unsigned long)argp); + return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; @@ -1450,53 +1451,53 @@ IGNORE_IOCTL(FBIOGCURSOR32) * a compat_ioctl operation in the place that handleѕ the * ioctl for the native case. 
*/ -static long do_ioctl_trans(int fd, unsigned int cmd, +static long do_ioctl_trans(unsigned int cmd, unsigned long arg, struct file *file) { void __user *argp = compat_ptr(arg); switch (cmd) { case PPPIOCGIDLE32: - return ppp_gidle(file, fd, cmd, argp); + return ppp_gidle(file, cmd, argp); case PPPIOCSCOMPRESS32: - return ppp_scompress(file, fd, cmd, argp); + return ppp_scompress(file, cmd, argp); case PPPIOCSPASS32: case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(file, fd, cmd, argp); + return ppp_sock_fprog_ioctl_trans(file, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: - return sg_ioctl_trans(file, fd, cmd, argp); + return sg_ioctl_trans(file, cmd, argp); case SG_GET_REQUEST_TABLE: - return sg_grt_trans(file, fd, cmd, argp); + return sg_grt_trans(file, cmd, argp); case MTIOCGET32: case MTIOCPOS32: - return mt_ioctl_trans(file, fd, cmd, argp); + return mt_ioctl_trans(file, cmd, argp); #endif /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: - return serial_struct_ioctl(file, fd, cmd, argp); + return serial_struct_ioctl(file, cmd, argp); /* i2c */ case I2C_FUNCS: - return w_long(file, fd, cmd, argp); + return w_long(file, cmd, argp); case I2C_RDWR: - return do_i2c_rdwr_ioctl(file, fd, cmd, argp); + return do_i2c_rdwr_ioctl(file, cmd, argp); case I2C_SMBUS: - return do_i2c_smbus_ioctl(file, fd, cmd, argp); + return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: - return rtc_ioctl(file, fd, cmd, argp); + return rtc_ioctl(file, cmd, argp); /* dvb */ case VIDEO_GET_EVENT: - return do_video_get_event(file, fd, cmd, argp); + return do_video_get_event(file, cmd, argp); case VIDEO_STILLPICTURE: - return do_video_stillpicture(file, fd, cmd, argp); + return do_video_stillpicture(file, cmd, argp); case VIDEO_SET_SPU_PALETTE: - return do_video_set_spu_palette(file, fd, cmd, argp); + return do_video_set_spu_palette(file, cmd, argp); } /* @@ -1527,7 +1528,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case NBD_SET_BLKSIZE: case NBD_SET_SIZE: case NBD_SET_SIZE_BLOCKS: - return do_vfs_ioctl(file, fd, cmd, arg); + return vfs_ioctl(file, cmd, arg); } return -ENOIOCTLCMD; @@ -1616,7 +1617,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (compat_ioctl_check_table(XFORM(cmd))) goto found_handler; - error = do_ioctl_trans(fd, cmd, arg, f.file); + error = do_ioctl_trans(cmd, arg, f.file); if (error == -ENOIOCTLCMD) error = -ENOTTY; diff --git a/fs/internal.h b/fs/internal.h index 71859c4d..e38c08c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m); * fs/nsfs.c */ extern struct dentry_operations ns_dentry_operations; + +/* + * fs/ioctl.c + */ +extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, + unsigned long arg); +extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5d01d26..41c352e 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,6 +15,7 @@ #include #include #include +#include "internal.h" #include @@ -32,8 +33,7 @@ * * Returns 0 on success, -errno on error. 
*/ -static long vfs_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa5142..51f9f8d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2764,8 +2764,6 @@ extern int vfs_lstat(const char __user *, struct kstat *); extern int vfs_fstat(unsigned int, struct kstat *); extern int vfs_fstatat(int , const char __user *, struct kstat *, int); -extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, - unsigned long arg); extern int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, -- cgit v1.1 From a7f61e89af73e9bf760826b20dba4e637221fcb9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 5 Jan 2016 18:27:30 +0100 Subject: compat_ioctl: don't call do_ioctl under set_fs(KERNEL_DS) This replaces all code in fs/compat_ioctl.c that translated ioctl arguments into an in-kernel structure, then performed do_ioctl under set_fs(KERNEL_DS), with code that allocates data on the user stack and can call the VFS ioctl handler under USER_DS. This is done as a hardening measure because the caller does not know what kind of ioctl handler will be invoked, only that no corresponding compat_ioctl handler exists and what the ioctl command number is. The accidental invocation of an unlocked_ioctl handler that unexpectedly calls copy_to_user could be a severe security issue. Signed-off-by: Jann Horn Signed-off-by: Al Viro --- fs/compat_ioctl.c | 130 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 62 deletions(-) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 908837c..9144b77 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -117,6 +117,13 @@ #include #endif +#define convert_in_user(srcptr, dstptr) \ +({ \ + typeof(*srcptr) val; \ + \ + get_user(val, srcptr) || put_user(val, dstptr); \ +}) + static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int err; @@ -131,16 +138,17 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static int w_long(struct file *file, unsigned int cmd, compat_ulong_t __user *argp) { - mm_segment_t old_fs = get_fs(); int err; - unsigned long val; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - set_fs (KERNEL_DS); - err = do_ioctl(file, cmd, (unsigned long)&val); - set_fs (old_fs); - if (!err && put_user(val, argp)) + if (valp == NULL) return -EFAULT; - return err; + err = do_ioctl(file, cmd, (unsigned long)valp); + if (err) + return err; + if (convert_in_user(valp, argp)) + return -EFAULT; + return 0; } struct compat_video_event { @@ -155,20 +163,20 @@ struct compat_video_event { static int do_video_get_event(struct file *file, unsigned int cmd, struct compat_video_event __user *up) { - struct video_event kevent; - mm_segment_t old_fs = get_fs(); + struct video_event __user *kevent = + compat_alloc_user_space(sizeof(*kevent)); int err; - set_fs(KERNEL_DS); - err = do_ioctl(file, cmd, (unsigned long) &kevent); - set_fs(old_fs); + if (kevent == NULL) + return -EFAULT; + err = do_ioctl(file, cmd, (unsigned long)kevent); if (!err) { - err = put_user(kevent.type, &up->type); - err |= put_user(kevent.timestamp, &up->timestamp); - err |= put_user(kevent.u.size.w, &up->u.size.w); - err |= put_user(kevent.u.size.h, &up->u.size.h); - err |= put_user(kevent.u.size.aspect_ratio, + err = convert_in_user(&kevent->type,
&up->type); + err |= convert_in_user(&kevent->timestamp, &up->timestamp); + err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); + err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); + err |= convert_in_user(&kevent->u.size.aspect_ratio, &up->u.size.aspect_ratio); if (err) err = -EFAULT; @@ -528,10 +536,10 @@ struct mtpos32 { static int mt_ioctl_trans(struct file *file, unsigned int cmd, void __user *argp) { - mm_segment_t old_fs = get_fs(); - struct mtget get; + /* NULL initialization to make gcc shut up */ + struct mtget __user *get = NULL; struct mtget32 __user *umget32; - struct mtpos pos; + struct mtpos __user *pos = NULL; struct mtpos32 __user *upos32; unsigned long kcmd; void *karg; @@ -540,32 +548,34 @@ static int mt_ioctl_trans(struct file *file, switch(cmd) { case MTIOCPOS32: kcmd = MTIOCPOS; - karg = &pos; + pos = compat_alloc_user_space(sizeof(*pos)); + karg = pos; break; default: /* MTIOCGET32 */ kcmd = MTIOCGET; - karg = &get; + get = compat_alloc_user_space(sizeof(*get)); + karg = get; break; } - set_fs (KERNEL_DS); + if (karg == NULL) + return -EFAULT; err = do_ioctl(file, kcmd, (unsigned long)karg); - set_fs (old_fs); if (err) return err; switch (cmd) { case MTIOCPOS32: upos32 = argp; - err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: umget32 = argp; - err = __put_user(get.mt_type, &umget32->mt_type); - err |= __put_user(get.mt_resid, &umget32->mt_resid); - err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); - err |= __put_user(get.mt_gstat, &umget32->mt_gstat); - err |= __put_user(get.mt_erreg, &umget32->mt_erreg); - err |= __put_user(get.mt_fileno, &umget32->mt_fileno); - err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + err = convert_in_user(&get->mt_type, &umget32->mt_type); + err |= convert_in_user(&get->mt_resid, &umget32->mt_resid); + err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg); + err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat); + err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg); + err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno); + err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno); break; } return err ? 
-EFAULT: 0; @@ -624,37 +634,36 @@ static int serial_struct_ioctl(struct file *file, { typedef struct serial_struct32 SS32; int err; - struct serial_struct ss; - mm_segment_t oldseg = get_fs(); + struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss)); __u32 udata; unsigned int base; + unsigned char *iomem_base; + if (ss == NULL) + return -EFAULT; if (cmd == TIOCSSERIAL) { - if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) + if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) || + get_user(udata, &ss32->iomem_base)) return -EFAULT; - if (__get_user(udata, &ss32->iomem_base)) + iomem_base = compat_ptr(udata); + if (put_user(iomem_base, &ss->iomem_base) || + convert_in_user(&ss32->iomem_reg_shift, + &ss->iomem_reg_shift) || + convert_in_user(&ss32->port_high, &ss->port_high) || + put_user(0UL, &ss->iomap_base)) return -EFAULT; - ss.iomem_base = compat_ptr(udata); - if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __get_user(ss.port_high, &ss32->port_high)) - return -EFAULT; - ss.iomap_base = 0UL; } - set_fs(KERNEL_DS); - err = do_ioctl(file, cmd, (unsigned long)&ss); - set_fs(oldseg); + err = do_ioctl(file, cmd, (unsigned long)ss); if (cmd == TIOCGSERIAL && err >= 0) { - if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) || + get_user(iomem_base, &ss->iomem_base)) return -EFAULT; - base = (unsigned long)ss.iomem_base >> 32 ? - 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; - if (__put_user(base, &ss32->iomem_base) || - __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __put_user(ss.port_high, &ss32->port_high)) + base = (unsigned long)iomem_base >> 32 ? + 0xffffffff : (unsigned)(unsigned long)iomem_base; + if (put_user(base, &ss32->iomem_base) || + convert_in_user(&ss->iomem_reg_shift, + &ss32->iomem_reg_shift) || + convert_in_user(&ss->port_high, &ss32->port_high)) return -EFAULT; } return err; @@ -759,23 +768,20 @@ static int do_i2c_smbus_ioctl(struct file *file, static int rtc_ioctl(struct file *file, unsigned cmd, void __user *argp) { - mm_segment_t oldfs = get_fs(); - compat_ulong_t val32; - unsigned long kval; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); int ret; + if (valp == NULL) + return -EFAULT; switch (cmd) { case RTC_IRQP_READ32: case RTC_EPOCH_READ32: - set_fs(KERNEL_DS); ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? RTC_IRQP_READ : RTC_EPOCH_READ, - (unsigned long)&kval); - set_fs(oldfs); + (unsigned long)valp); if (ret) return ret; - val32 = kval; - return put_user(val32, (unsigned int __user *)argp); + return convert_in_user(valp, (unsigned int __user *)argp); case RTC_IRQP_SET32: return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: -- cgit v1.1
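Taken together, these commits leave a simple recipe for compat ioctl translations: resolve the struct file once, build a native-layout copy in user-accessible memory with compat_alloc_user_space(), and move fields with get_user()/put_user() or convert_in_user() instead of set_fs(KERNEL_DS). A hypothetical translation in the new style (struct foo, struct foo32 and FOO_GET are invented for illustration; do_ioctl() and convert_in_user() are the helpers from the diffs above):

struct foo { void __user *buf; int len; };

struct foo32 { compat_uptr_t buf; compat_int_t len; };

static int foo_get_trans(struct file *file, struct foo32 __user *arg32)
{
	struct foo __user *arg = compat_alloc_user_space(sizeof(*arg));
	compat_uptr_t buf;
	int err;

	if (arg == NULL)
		return -EFAULT;

	/* Widen the 32-bit layout into the native one, in user memory. */
	if (get_user(buf, &arg32->buf) ||
	    put_user(compat_ptr(buf), &arg->buf) ||
	    convert_in_user(&arg32->len, &arg->len))
		return -EFAULT;

	err = do_ioctl(file, FOO_GET, (unsigned long)arg);
	if (err)
		return err;

	/* Copy the result back into the 32-bit layout. */
	return convert_in_user(&arg->len, &arg32->len) ? -EFAULT : 0;
}

The point of the pattern is that every dereference goes through get_user()/put_user() on genuine user addresses, so an unexpected unlocked_ioctl handler that calls copy_to_user() can at worst fault instead of scribbling over kernel memory.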