powerpc/mm/radix: Workaround prefetch issue with KVM

There's a somewhat architectural issue with Radix MMU and KVM. When coming out of a guest with AIL (Alternate Interrupt Location, ie, MMU enabled), we start executing hypervisor code with the PID register still containing whatever the guest has been using. The problem is that the CPU can (and will) then start prefetching or speculatively load from whatever host context has that same PID (if any), thus bringing translations for that context into the TLB, which Linux doesn't know about. This can cause stale translations and subsequent crashes. Fixing this in a way that is neither racy nor a huge performance impact is difficult. We could just make the host invalidations always use broadcast forms but that would hurt single threaded programs for example. We chose to fix it instead by partitioning the PID space between guest and host. This is possible because today Linux only use 19 out of the 20 bits of PID space, so existing guests will work if we make the host use the top half of the 20 bits space. We additionally add support for a property to indicate to Linux the size of the PID register which will be useful if we eventually have processors with a larger PID space available. There is still an issue with malicious guests purposefully setting the PID register to a value in the hosts PID range. Hopefully future HW can prevent that, but in the meantime, we handle it with a pair of kludges: - On the way out of a guest, before we clear the current VCPU in the PACA, we check the PID and if it's outside of the permitted range we flush the TLB for that PID. - When context switching, if the mm is "new" on that CPU (the corresponding bit was set for the first time in the mm cpumask), we check if any sibling thread is in KVM (has a non-NULL VCPU pointer in the PACA). If that is the case, we also flush the PID for that CPU (core). This second part is needed to handle the case where a process is migrated (or starts a new pthread) on a sibling thread of the CPU coming out of KVM, as there's a window where stale translations can exist before we detect it and flush them out. A future optimization could be added by keeping track of whether the PID has ever been used and avoid doing that for completely fresh PIDs. We could similarily mark PIDs that have been the subject of a global invalidation as "fresh". But for now this will do. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop unneeded include of kvm_book3s_asm.h] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
author: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2017-07-24 14:26:06 +1000
committer: Michael Ellerman <mpe@ellerman.id.au> 2017-07-26 16:41:52 +1000
commit: a25bd72badfa793ab5aeafd50dbd9db39f8c9179 (patch)
tree: 9945d409b81667476c7db1c8db2c263179379cc0 /arch/powerpc/mm
parent: 029d9252b116fa52a95150819e62af1f6e420fe5 (diff)
download: op-kernel-dev-a25bd72badfa793ab5aeafd50dbd9db39f8c9179.zip
op-kernel-dev-a25bd72badfa793ab5aeafd50dbd9db39f8c9179.tar.gz
3 files changed, 79 insertions, 5 deletions
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index abed1fe..a75f638 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -126,9 +126,10 @@ static int hash__init_new_context(struct mm_struct *mm)
 static int radix__init_new_context(struct mm_struct *mm)
 {
 	unsigned long rts_field;
-	int index;
+	int index, max_id;
 
-	index = alloc_context_id(1, PRTB_ENTRIES - 1);
+	max_id = (1 << mmu_pid_bits) - 1;
+	index = alloc_context_id(mmu_base_pid, max_id);
 	if (index < 0)
 		return index;
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 5cc50d4..671a45d 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -25,6 +25,9 @@
 
 #include <trace/events/thp.h>
 
+unsigned int mmu_pid_bits;
+unsigned int mmu_base_pid;
+
 static int native_register_process_table(unsigned long base, unsigned long pg_sz,
 					 unsigned long table_size)
 {
@@ -261,11 +264,34 @@ static void __init radix_init_pgtable(void)
 	for_each_memblock(memory, reg)
 		WARN_ON(create_physical_mapping(reg->base,
 						reg->base + reg->size));
+
+	/* Find out how many PID bits are supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (!mmu_pid_bits)
+			mmu_pid_bits = 20;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		/*
+		 * When KVM is possible, we only use the top half of the
+		 * PID space to avoid collisions between host and guest PIDs
+		 * which can cause problems due to prefetch when exiting the
+		 * guest with AIL=3
+		 */
+		mmu_base_pid = 1 << (mmu_pid_bits - 1);
+#else
+		mmu_base_pid = 1;
+#endif
+	} else {
+		/* The guest uses the bottom half of the PID space */
+		if (!mmu_pid_bits)
+			mmu_pid_bits = 19;
+		mmu_base_pid = 1;
+	}
+
 	/*
 	 * Allocate Partition table and process table for the
 	 * host.
 	 */
-	BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 36), "Process table size too large.");
+	BUG_ON(PRTB_SIZE_SHIFT > 36);
 	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
 	/*
 	 * Fill in the process table.
@@ -339,6 +365,12 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
+	/* Find MMU PID size */
+	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+	if (prop && size == 4)
+		mmu_pid_bits = be32_to_cpup(prop);
+
+	/* Grab page size encodings */
 	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
 	if (!prop)
 		return 0;
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 744e016..16ae1bb 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -12,12 +12,12 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
-#include <asm/ppc-opcode.h>
 
+#include <asm/ppc-opcode.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/trace.h>
-
+#include <asm/cputhreads.h>
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
@@ -454,3 +454,44 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
 	else
 		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
 }
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+{
+	unsigned int pid = mm->context.id;
+
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	/*
+	 * If this context hasn't run on that CPU before and KVM is
+	 * around, there's a slim chance that the guest on another
+	 * CPU just brought in obsolete translation into the TLB of
+	 * this CPU due to a bad prefetch using the guest PID on
+	 * the way into the hypervisor.
+	 *
+	 * We work around this here. If KVM is possible, we check if
+	 * any sibling thread is in KVM. If it is, the window may exist
+	 * and thus we flush that PID from the core.
+	 *
+	 * A potential future improvement would be to mark which PIDs
+	 * have never been used on the system and avoid it if the PID
+	 * is new and the process has no other cpumask bit set.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
+		int cpu = smp_processor_id();
+		int sib = cpu_first_thread_sibling(cpu);
+		bool flush = false;
+
+		for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
+			if (sib == cpu)
+				continue;
+			if (paca[sib].kvm_hstate.kvm_vcpu)
+				flush = true;
+		}
+		if (flush)
+			_tlbiel_pid(pid, RIC_FLUSH_ALL);
+	}
+}
+EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
author	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2017-07-24 14:26:06 +1000
committer	Michael Ellerman <mpe@ellerman.id.au>	2017-07-26 16:41:52 +1000
commit	a25bd72badfa793ab5aeafd50dbd9db39f8c9179 (patch)
tree	9945d409b81667476c7db1c8db2c263179379cc0 /arch/powerpc/mm
parent	029d9252b116fa52a95150819e62af1f6e420fe5 (diff)
download	op-kernel-dev-a25bd72badfa793ab5aeafd50dbd9db39f8c9179.zip op-kernel-dev-a25bd72badfa793ab5aeafd50dbd9db39f8c9179.tar.gz