146 files changed, 8282 insertions, 2750 deletions
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index d488b88..f0e6657 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -30,7 +30,6 @@
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
 #define KVM_USER_MEM_SLOTS 32
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HAVE_ONE_REG
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
@@ -45,7 +44,7 @@
 #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
 #endif
 
-#define KVM_REQ_VCPU_EXIT	8
+#define KVM_REQ_VCPU_EXIT	(8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index a5838d6..a887263 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -27,6 +27,8 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_READONLY_MEM
 
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 #define KVM_REG_SIZE(id)						\
 	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
 
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 7941699..8966e59 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -208,9 +208,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IMMEDIATE_EXIT:
 		r = 1;
 		break;
-	case KVM_CAP_COALESCED_MMIO:
-		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-		break;
 	case KVM_CAP_ARM_SET_DEVICE_ADDR:
 		r = 1;
 		break;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 578df18..5e19165 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -31,7 +31,6 @@
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
 #define KVM_USER_MEM_SLOTS 512
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
 #include <kvm/arm_vgic.h>
@@ -42,7 +41,7 @@
 
 #define KVM_VCPU_MAX_FEATURES 4
 
-#define KVM_REQ_VCPU_EXIT	8
+#define KVM_REQ_VCPU_EXIT	(8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index cd6bea4..869ee48 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -39,6 +39,8 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_READONLY_MEM
 
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 #define KVM_REG_SIZE(id)						\
 	(1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index a008a9f..0a4adbc 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1687,6 +1687,7 @@ config CPU_CAVIUM_OCTEON
 	select USB_EHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN
 	select USB_OHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN
 	select MIPS_L1_CACHE_SHIFT_7
+	select HAVE_KVM
 	help
 	  The Cavium Octeon processor is a highly integrated chip containing
 	  many ethernet hardware widgets for networking tasks. The processor
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index e961c8a..494d382 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -444,6 +444,10 @@
 # define cpu_has_msa		0
 #endif
 
+#ifndef cpu_has_ufr
+# define cpu_has_ufr		(cpu_data[0].options & MIPS_CPU_UFR)
+#endif
+
 #ifndef cpu_has_fre
 # define cpu_has_fre		(cpu_data[0].options & MIPS_CPU_FRE)
 #endif
@@ -528,6 +532,9 @@
 #ifndef cpu_guest_has_htw
 #define cpu_guest_has_htw	(cpu_data[0].guest.options & MIPS_CPU_HTW)
 #endif
+#ifndef cpu_guest_has_mvh
+#define cpu_guest_has_mvh	(cpu_data[0].guest.options & MIPS_CPU_MVH)
+#endif
 #ifndef cpu_guest_has_msa
 #define cpu_guest_has_msa	(cpu_data[0].guest.ases & MIPS_ASE_MSA)
 #endif
@@ -543,6 +550,9 @@
 #ifndef cpu_guest_has_maar
 #define cpu_guest_has_maar	(cpu_data[0].guest.options & MIPS_CPU_MAAR)
 #endif
+#ifndef cpu_guest_has_userlocal
+#define cpu_guest_has_userlocal	(cpu_data[0].guest.options & MIPS_CPU_ULRI)
+#endif
 
 /*
  * Guest dynamic capabilities
diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h
index edbe273..be3b4c2 100644
--- a/arch/mips/include/asm/cpu-info.h
+++ b/arch/mips/include/asm/cpu-info.h
@@ -33,6 +33,7 @@ struct guest_info {
 	unsigned long		ases_dyn;
 	unsigned long long	options;
 	unsigned long long	options_dyn;
+	int			tlbsize;
 	u8			conf;
 	u8			kscratch_mask;
 };
@@ -109,6 +110,7 @@ struct cpuinfo_mips {
 	struct guest_info	guest;
 	unsigned int		gtoffset_mask;
 	unsigned int		guestid_mask;
+	unsigned int		guestid_cache;
 } __attribute__((aligned(SMP_CACHE_BYTES)));
 
 extern struct cpuinfo_mips cpu_data[];
diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h
index 9a83724..98f5930 100644
--- a/arch/mips/include/asm/cpu.h
+++ b/arch/mips/include/asm/cpu.h
@@ -415,6 +415,7 @@ enum cpu_type_enum {
 #define MIPS_CPU_GUESTCTL2	MBIT_ULL(50)	/* CPU has VZ GuestCtl2 register */
 #define MIPS_CPU_GUESTID	MBIT_ULL(51)	/* CPU uses VZ ASE GuestID feature */
 #define MIPS_CPU_DRG		MBIT_ULL(52)	/* CPU has VZ Direct Root to Guest (DRG) */
+#define MIPS_CPU_UFR		MBIT_ULL(53)	/* CPU supports User mode FR switching */
 
 /*
  * CPU ASE encodings
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 05e785f..2998479 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -10,6 +10,7 @@
 #ifndef __MIPS_KVM_HOST_H__
 #define __MIPS_KVM_HOST_H__
 
+#include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <linux/hrtimer.h>
 #include <linux/interrupt.h>
@@ -33,12 +34,23 @@
 #define KVM_REG_MIPS_CP0_ENTRYLO0	MIPS_CP0_64(2, 0)
 #define KVM_REG_MIPS_CP0_ENTRYLO1	MIPS_CP0_64(3, 0)
 #define KVM_REG_MIPS_CP0_CONTEXT	MIPS_CP0_64(4, 0)
+#define KVM_REG_MIPS_CP0_CONTEXTCONFIG	MIPS_CP0_32(4, 1)
 #define KVM_REG_MIPS_CP0_USERLOCAL	MIPS_CP0_64(4, 2)
+#define KVM_REG_MIPS_CP0_XCONTEXTCONFIG	MIPS_CP0_64(4, 3)
 #define KVM_REG_MIPS_CP0_PAGEMASK	MIPS_CP0_32(5, 0)
 #define KVM_REG_MIPS_CP0_PAGEGRAIN	MIPS_CP0_32(5, 1)
+#define KVM_REG_MIPS_CP0_SEGCTL0	MIPS_CP0_64(5, 2)
+#define KVM_REG_MIPS_CP0_SEGCTL1	MIPS_CP0_64(5, 3)
+#define KVM_REG_MIPS_CP0_SEGCTL2	MIPS_CP0_64(5, 4)
+#define KVM_REG_MIPS_CP0_PWBASE		MIPS_CP0_64(5, 5)
+#define KVM_REG_MIPS_CP0_PWFIELD	MIPS_CP0_64(5, 6)
+#define KVM_REG_MIPS_CP0_PWSIZE		MIPS_CP0_64(5, 7)
 #define KVM_REG_MIPS_CP0_WIRED		MIPS_CP0_32(6, 0)
+#define KVM_REG_MIPS_CP0_PWCTL		MIPS_CP0_32(6, 6)
 #define KVM_REG_MIPS_CP0_HWRENA		MIPS_CP0_32(7, 0)
 #define KVM_REG_MIPS_CP0_BADVADDR	MIPS_CP0_64(8, 0)
+#define KVM_REG_MIPS_CP0_BADINSTR	MIPS_CP0_32(8, 1)
+#define KVM_REG_MIPS_CP0_BADINSTRP	MIPS_CP0_32(8, 2)
 #define KVM_REG_MIPS_CP0_COUNT		MIPS_CP0_32(9, 0)
 #define KVM_REG_MIPS_CP0_ENTRYHI	MIPS_CP0_64(10, 0)
 #define KVM_REG_MIPS_CP0_COMPARE	MIPS_CP0_32(11, 0)
@@ -55,6 +67,7 @@
 #define KVM_REG_MIPS_CP0_CONFIG4	MIPS_CP0_32(16, 4)
 #define KVM_REG_MIPS_CP0_CONFIG5	MIPS_CP0_32(16, 5)
 #define KVM_REG_MIPS_CP0_CONFIG7	MIPS_CP0_32(16, 7)
+#define KVM_REG_MIPS_CP0_MAARI		MIPS_CP0_64(17, 2)
 #define KVM_REG_MIPS_CP0_XCONTEXT	MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC	MIPS_CP0_64(30, 0)
 #define KVM_REG_MIPS_CP0_KSCRATCH1	MIPS_CP0_64(31, 2)
@@ -70,9 +83,13 @@
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS	0
 
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
+#ifdef CONFIG_KVM_MIPS_VZ
+extern unsigned long GUESTID_MASK;
+extern unsigned long GUESTID_FIRST_VERSION;
+extern unsigned long GUESTID_VERSION_MASK;
+#endif
 
 
 /*
@@ -145,6 +162,16 @@ struct kvm_vcpu_stat {
 	u64 fpe_exits;
 	u64 msa_disabled_exits;
 	u64 flush_dcache_exits;
+#ifdef CONFIG_KVM_MIPS_VZ
+	u64 vz_gpsi_exits;
+	u64 vz_gsfc_exits;
+	u64 vz_hc_exits;
+	u64 vz_grr_exits;
+	u64 vz_gva_exits;
+	u64 vz_ghfc_exits;
+	u64 vz_gpa_exits;
+	u64 vz_resvd_exits;
+#endif
 	u64 halt_successful_poll;
 	u64 halt_attempted_poll;
 	u64 halt_poll_invalid;
@@ -157,6 +184,8 @@ struct kvm_arch_memory_slot {
 struct kvm_arch {
 	/* Guest physical mm */
 	struct mm_struct gpa_mm;
+	/* Mask of CPUs needing GPA ASID flush */
+	cpumask_t asid_flush_mask;
 };
 
 #define N_MIPS_COPROC_REGS	32
@@ -214,6 +243,11 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG4_SEL	4
 #define MIPS_CP0_CONFIG5_SEL	5
 
+#define MIPS_CP0_GUESTCTL2	10
+#define MIPS_CP0_GUESTCTL2_SEL	5
+#define MIPS_CP0_GTOFFSET	12
+#define MIPS_CP0_GTOFFSET_SEL	7
+
 /* Resume Flags */
 #define RESUME_FLAG_DR		(1<<0)	/* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST	(1<<1)	/* Resume host? */
@@ -229,6 +263,7 @@ enum emulation_result {
 	EMULATE_WAIT,		/* WAIT instruction */
 	EMULATE_PRIV_FAIL,
 	EMULATE_EXCEPT,		/* A guest exception has been generated */
+	EMULATE_HYPERCALL,	/* HYPCALL instruction */
 };
 
 #define mips3_paddr_to_tlbpfn(x) \
@@ -276,13 +311,18 @@ struct kvm_mmu_memory_cache {
 struct kvm_vcpu_arch {
 	void *guest_ebase;
 	int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+	/* Host registers preserved across guest mode execution */
 	unsigned long host_stack;
 	unsigned long host_gp;
+	unsigned long host_pgd;
+	unsigned long host_entryhi;
 
 	/* Host CP0 registers used when handling exits from guest */
 	unsigned long host_cp0_badvaddr;
 	unsigned long host_cp0_epc;
 	u32 host_cp0_cause;
+	u32 host_cp0_guestctl0;
 	u32 host_cp0_badinstr;
 	u32 host_cp0_badinstrp;
 
@@ -340,7 +380,23 @@ struct kvm_vcpu_arch {
 	/* Cache some mmu pages needed inside spinlock regions */
 	struct kvm_mmu_memory_cache mmu_page_cache;
 
+#ifdef CONFIG_KVM_MIPS_VZ
+	/* vcpu's vzguestid is different on each host cpu in an smp system */
+	u32 vzguestid[NR_CPUS];
+
+	/* wired guest TLB entries */
+	struct kvm_mips_tlb *wired_tlb;
+	unsigned int wired_tlb_limit;
+	unsigned int wired_tlb_used;
+
+	/* emulated guest MAAR registers */
+	unsigned long maar[6];
+#endif
+
+	/* Last CPU the VCPU state was loaded on */
 	int last_sched_cpu;
+	/* Last CPU the VCPU actually executed guest code on */
+	int last_exec_cpu;
 
 	/* WAIT executed */
 	int wait;
@@ -349,78 +405,6 @@ struct kvm_vcpu_arch {
 	u8 msa_enabled;
 };
 
-
-#define kvm_read_c0_guest_index(cop0)		(cop0->reg[MIPS_CP0_TLB_INDEX][0])
-#define kvm_write_c0_guest_index(cop0, val)	(cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
-#define kvm_read_c0_guest_entrylo0(cop0)	(cop0->reg[MIPS_CP0_TLB_LO0][0])
-#define kvm_write_c0_guest_entrylo0(cop0, val)	(cop0->reg[MIPS_CP0_TLB_LO0][0] = (val))
-#define kvm_read_c0_guest_entrylo1(cop0)	(cop0->reg[MIPS_CP0_TLB_LO1][0])
-#define kvm_write_c0_guest_entrylo1(cop0, val)	(cop0->reg[MIPS_CP0_TLB_LO1][0] = (val))
-#define kvm_read_c0_guest_context(cop0)		(cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
-#define kvm_write_c0_guest_context(cop0, val)	(cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
-#define kvm_read_c0_guest_userlocal(cop0)	(cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
-#define kvm_write_c0_guest_userlocal(cop0, val)	(cop0->reg[MIPS_CP0_TLB_CONTEXT][2] = (val))
-#define kvm_read_c0_guest_pagemask(cop0)	(cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
-#define kvm_write_c0_guest_pagemask(cop0, val)	(cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
-#define kvm_read_c0_guest_wired(cop0)		(cop0->reg[MIPS_CP0_TLB_WIRED][0])
-#define kvm_write_c0_guest_wired(cop0, val)	(cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
-#define kvm_read_c0_guest_hwrena(cop0)		(cop0->reg[MIPS_CP0_HWRENA][0])
-#define kvm_write_c0_guest_hwrena(cop0, val)	(cop0->reg[MIPS_CP0_HWRENA][0] = (val))
-#define kvm_read_c0_guest_badvaddr(cop0)	(cop0->reg[MIPS_CP0_BAD_VADDR][0])
-#define kvm_write_c0_guest_badvaddr(cop0, val)	(cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
-#define kvm_read_c0_guest_count(cop0)		(cop0->reg[MIPS_CP0_COUNT][0])
-#define kvm_write_c0_guest_count(cop0, val)	(cop0->reg[MIPS_CP0_COUNT][0] = (val))
-#define kvm_read_c0_guest_entryhi(cop0)		(cop0->reg[MIPS_CP0_TLB_HI][0])
-#define kvm_write_c0_guest_entryhi(cop0, val)	(cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
-#define kvm_read_c0_guest_compare(cop0)		(cop0->reg[MIPS_CP0_COMPARE][0])
-#define kvm_write_c0_guest_compare(cop0, val)	(cop0->reg[MIPS_CP0_COMPARE][0] = (val))
-#define kvm_read_c0_guest_status(cop0)		(cop0->reg[MIPS_CP0_STATUS][0])
-#define kvm_write_c0_guest_status(cop0, val)	(cop0->reg[MIPS_CP0_STATUS][0] = (val))
-#define kvm_read_c0_guest_intctl(cop0)		(cop0->reg[MIPS_CP0_STATUS][1])
-#define kvm_write_c0_guest_intctl(cop0, val)	(cop0->reg[MIPS_CP0_STATUS][1] = (val))
-#define kvm_read_c0_guest_cause(cop0)		(cop0->reg[MIPS_CP0_CAUSE][0])
-#define kvm_write_c0_guest_cause(cop0, val)	(cop0->reg[MIPS_CP0_CAUSE][0] = (val))
-#define kvm_read_c0_guest_epc(cop0)		(cop0->reg[MIPS_CP0_EXC_PC][0])
-#define kvm_write_c0_guest_epc(cop0, val)	(cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
-#define kvm_read_c0_guest_prid(cop0)		(cop0->reg[MIPS_CP0_PRID][0])
-#define kvm_write_c0_guest_prid(cop0, val)	(cop0->reg[MIPS_CP0_PRID][0] = (val))
-#define kvm_read_c0_guest_ebase(cop0)		(cop0->reg[MIPS_CP0_PRID][1])
-#define kvm_write_c0_guest_ebase(cop0, val)	(cop0->reg[MIPS_CP0_PRID][1] = (val))
-#define kvm_read_c0_guest_config(cop0)		(cop0->reg[MIPS_CP0_CONFIG][0])
-#define kvm_read_c0_guest_config1(cop0)		(cop0->reg[MIPS_CP0_CONFIG][1])
-#define kvm_read_c0_guest_config2(cop0)		(cop0->reg[MIPS_CP0_CONFIG][2])
-#define kvm_read_c0_guest_config3(cop0)		(cop0->reg[MIPS_CP0_CONFIG][3])
-#define kvm_read_c0_guest_config4(cop0)		(cop0->reg[MIPS_CP0_CONFIG][4])
-#define kvm_read_c0_guest_config5(cop0)		(cop0->reg[MIPS_CP0_CONFIG][5])
-#define kvm_read_c0_guest_config7(cop0)		(cop0->reg[MIPS_CP0_CONFIG][7])
-#define kvm_write_c0_guest_config(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][0] = (val))
-#define kvm_write_c0_guest_config1(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][1] = (val))
-#define kvm_write_c0_guest_config2(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][2] = (val))
-#define kvm_write_c0_guest_config3(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][3] = (val))
-#define kvm_write_c0_guest_config4(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][4] = (val))
-#define kvm_write_c0_guest_config5(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][5] = (val))
-#define kvm_write_c0_guest_config7(cop0, val)	(cop0->reg[MIPS_CP0_CONFIG][7] = (val))
-#define kvm_read_c0_guest_errorepc(cop0)	(cop0->reg[MIPS_CP0_ERROR_PC][0])
-#define kvm_write_c0_guest_errorepc(cop0, val)	(cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
-#define kvm_read_c0_guest_kscratch1(cop0)	(cop0->reg[MIPS_CP0_DESAVE][2])
-#define kvm_read_c0_guest_kscratch2(cop0)	(cop0->reg[MIPS_CP0_DESAVE][3])
-#define kvm_read_c0_guest_kscratch3(cop0)	(cop0->reg[MIPS_CP0_DESAVE][4])
-#define kvm_read_c0_guest_kscratch4(cop0)	(cop0->reg[MIPS_CP0_DESAVE][5])
-#define kvm_read_c0_guest_kscratch5(cop0)	(cop0->reg[MIPS_CP0_DESAVE][6])
-#define kvm_read_c0_guest_kscratch6(cop0)	(cop0->reg[MIPS_CP0_DESAVE][7])
-#define kvm_write_c0_guest_kscratch1(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][2] = (val))
-#define kvm_write_c0_guest_kscratch2(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][3] = (val))
-#define kvm_write_c0_guest_kscratch3(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][4] = (val))
-#define kvm_write_c0_guest_kscratch4(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][5] = (val))
-#define kvm_write_c0_guest_kscratch5(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][6] = (val))
-#define kvm_write_c0_guest_kscratch6(cop0, val)	(cop0->reg[MIPS_CP0_DESAVE][7] = (val))
-
-/*
- * Some of the guest registers may be modified asynchronously (e.g. from a
- * hrtimer callback in hard irq context) and therefore need stronger atomicity
- * guarantees than other registers.
- */
-
 static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
 						unsigned long val)
 {
@@ -471,26 +455,286 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
 	} while (unlikely(!temp));
 }
 
-#define kvm_set_c0_guest_status(cop0, val)	(cop0->reg[MIPS_CP0_STATUS][0] |= (val))
-#define kvm_clear_c0_guest_status(cop0, val)	(cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
+/* Guest register types, used in accessor build below */
+#define __KVMT32	u32
+#define __KVMTl	unsigned long
 
-/* Cause can be modified asynchronously from hardirq hrtimer callback */
-#define kvm_set_c0_guest_cause(cop0, val)				\
-	_kvm_atomic_set_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val)
-#define kvm_clear_c0_guest_cause(cop0, val)				\
-	_kvm_atomic_clear_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val)
-#define kvm_change_c0_guest_cause(cop0, change, val)			\
-	_kvm_atomic_change_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0],	\
-					change, val)
-
-#define kvm_set_c0_guest_ebase(cop0, val)	(cop0->reg[MIPS_CP0_PRID][1] |= (val))
-#define kvm_clear_c0_guest_ebase(cop0, val)	(cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
-#define kvm_change_c0_guest_ebase(cop0, change, val)			\
+/*
+ * __BUILD_KVM_$ops_SAVED(): kvm_$op_sw_gc0_$reg()
+ * These operate on the saved guest C0 state in RAM.
+ */
+
+/* Generate saved context simple accessors */
+#define __BUILD_KVM_RW_SAVED(name, type, _reg, sel)			\
+static inline __KVMT##type kvm_read_sw_gc0_##name(struct mips_coproc *cop0) \
+{									\
+	return cop0->reg[(_reg)][(sel)];				\
+}									\
+static inline void kvm_write_sw_gc0_##name(struct mips_coproc *cop0,	\
+					   __KVMT##type val)		\
+{									\
+	cop0->reg[(_reg)][(sel)] = val;					\
+}
+
+/* Generate saved context bitwise modifiers */
+#define __BUILD_KVM_SET_SAVED(name, type, _reg, sel)			\
+static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0,	\
+					 __KVMT##type val)		\
+{									\
+	cop0->reg[(_reg)][(sel)] |= val;				\
+}									\
+static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0,	\
+					   __KVMT##type val)		\
+{									\
+	cop0->reg[(_reg)][(sel)] &= ~val;				\
+}									\
+static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0,	\
+					    __KVMT##type mask,		\
+					    __KVMT##type val)		\
+{									\
+	unsigned long _mask = mask;					\
+	cop0->reg[(_reg)][(sel)] &= ~_mask;				\
+	cop0->reg[(_reg)][(sel)] |= val & _mask;			\
+}
+
+/* Generate saved context atomic bitwise modifiers */
+#define __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel)			\
+static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0,	\
+					 __KVMT##type val)		\
+{									\
+	_kvm_atomic_set_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val);	\
+}									\
+static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0,	\
+					   __KVMT##type val)		\
+{									\
+	_kvm_atomic_clear_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val);	\
+}									\
+static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0,	\
+					    __KVMT##type mask,		\
+					    __KVMT##type val)		\
+{									\
+	_kvm_atomic_change_c0_guest_reg(&cop0->reg[(_reg)][(sel)], mask, \
+					val);				\
+}
+
+/*
+ * __BUILD_KVM_$ops_VZ(): kvm_$op_vz_gc0_$reg()
+ * These operate on the VZ guest C0 context in hardware.
+ */
+
+/* Generate VZ guest context simple accessors */
+#define __BUILD_KVM_RW_VZ(name, type, _reg, sel)			\
+static inline __KVMT##type kvm_read_vz_gc0_##name(struct mips_coproc *cop0) \
+{									\
+	return read_gc0_##name();					\
+}									\
+static inline void kvm_write_vz_gc0_##name(struct mips_coproc *cop0,	\
+					   __KVMT##type val)		\
+{									\
+	write_gc0_##name(val);						\
+}
+
+/* Generate VZ guest context bitwise modifiers */
+#define __BUILD_KVM_SET_VZ(name, type, _reg, sel)			\
+static inline void kvm_set_vz_gc0_##name(struct mips_coproc *cop0,	\
+					 __KVMT##type val)		\
+{									\
+	set_gc0_##name(val);						\
+}									\
+static inline void kvm_clear_vz_gc0_##name(struct mips_coproc *cop0,	\
+					   __KVMT##type val)		\
+{									\
+	clear_gc0_##name(val);						\
+}									\
+static inline void kvm_change_vz_gc0_##name(struct mips_coproc *cop0,	\
+					    __KVMT##type mask,		\
+					    __KVMT##type val)		\
+{									\
+	change_gc0_##name(mask, val);					\
+}
+
+/* Generate VZ guest context save/restore to/from saved context */
+#define __BUILD_KVM_SAVE_VZ(name, _reg, sel)			\
+static inline void kvm_restore_gc0_##name(struct mips_coproc *cop0)	\
+{									\
+	write_gc0_##name(cop0->reg[(_reg)][(sel)]);			\
+}									\
+static inline void kvm_save_gc0_##name(struct mips_coproc *cop0)	\
+{									\
+	cop0->reg[(_reg)][(sel)] = read_gc0_##name();			\
+}
+
+/*
+ * __BUILD_KVM_$ops_WRAP(): kvm_$op_$name1() -> kvm_$op_$name2()
+ * These wrap a set of operations to provide them with a different name.
+ */
+
+/* Generate simple accessor wrapper */
+#define __BUILD_KVM_RW_WRAP(name1, name2, type)				\
+static inline __KVMT##type kvm_read_##name1(struct mips_coproc *cop0)	\
+{									\
+	return kvm_read_##name2(cop0);					\
+}									\
+static inline void kvm_write_##name1(struct mips_coproc *cop0,		\
+				     __KVMT##type val)			\
+{									\
+	kvm_write_##name2(cop0, val);					\
+}
+
+/* Generate bitwise modifier wrapper */
+#define __BUILD_KVM_SET_WRAP(name1, name2, type)			\
+static inline void kvm_set_##name1(struct mips_coproc *cop0,		\
+				   __KVMT##type val)			\
 {									\
-	kvm_clear_c0_guest_ebase(cop0, change);				\
-	kvm_set_c0_guest_ebase(cop0, ((val) & (change)));		\
+	kvm_set_##name2(cop0, val);					\
+}									\
+static inline void kvm_clear_##name1(struct mips_coproc *cop0,		\
+				     __KVMT##type val)			\
+{									\
+	kvm_clear_##name2(cop0, val);					\
+}									\
+static inline void kvm_change_##name1(struct mips_coproc *cop0,		\
+				      __KVMT##type mask,		\
+				      __KVMT##type val)			\
+{									\
+	kvm_change_##name2(cop0, mask, val);				\
 }
 
+/*
+ * __BUILD_KVM_$ops_SW(): kvm_$op_c0_guest_$reg() -> kvm_$op_sw_gc0_$reg()
+ * These generate accessors operating on the saved context in RAM, and wrap them
+ * with the common guest C0 accessors (for use by common emulation code).
+ */
+
+#define __BUILD_KVM_RW_SW(name, type, _reg, sel)			\
+	__BUILD_KVM_RW_SAVED(name, type, _reg, sel)			\
+	__BUILD_KVM_RW_WRAP(c0_guest_##name, sw_gc0_##name, type)
+
+#define __BUILD_KVM_SET_SW(name, type, _reg, sel)			\
+	__BUILD_KVM_SET_SAVED(name, type, _reg, sel)			\
+	__BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type)
+
+#define __BUILD_KVM_ATOMIC_SW(name, type, _reg, sel)			\
+	__BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel)			\
+	__BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type)
+
+#ifndef CONFIG_KVM_MIPS_VZ
+
+/*
+ * T&E (trap & emulate software based virtualisation)
+ * We generate the common accessors operating exclusively on the saved context
+ * in RAM.
+ */
+
+#define __BUILD_KVM_RW_HW	__BUILD_KVM_RW_SW
+#define __BUILD_KVM_SET_HW	__BUILD_KVM_SET_SW
+#define __BUILD_KVM_ATOMIC_HW	__BUILD_KVM_ATOMIC_SW
+
+#else
+
+/*
+ * VZ (hardware assisted virtualisation)
+ * These macros use the active guest state in VZ mode (hardware registers),
+ */
+
+/*
+ * __BUILD_KVM_$ops_HW(): kvm_$op_c0_guest_$reg() -> kvm_$op_vz_gc0_$reg()
+ * These generate accessors operating on the VZ guest context in hardware, and
+ * wrap them with the common guest C0 accessors (for use by common emulation
+ * code).
+ *
+ * Accessors operating on the saved context in RAM are also generated to allow
+ * convenient explicit saving and restoring of the state.
+ */
+
+#define __BUILD_KVM_RW_HW(name, type, _reg, sel)			\
+	__BUILD_KVM_RW_SAVED(name, type, _reg, sel)			\
+	__BUILD_KVM_RW_VZ(name, type, _reg, sel)			\
+	__BUILD_KVM_RW_WRAP(c0_guest_##name, vz_gc0_##name, type)	\
+	__BUILD_KVM_SAVE_VZ(name, _reg, sel)
+
+#define __BUILD_KVM_SET_HW(name, type, _reg, sel)			\
+	__BUILD_KVM_SET_SAVED(name, type, _reg, sel)			\
+	__BUILD_KVM_SET_VZ(name, type, _reg, sel)			\
+	__BUILD_KVM_SET_WRAP(c0_guest_##name, vz_gc0_##name, type)
+
+/*
+ * We can't do atomic modifications of COP0 state if hardware can modify it.
+ * Races must be handled explicitly.
+ */
+#define __BUILD_KVM_ATOMIC_HW	__BUILD_KVM_SET_HW
+
+#endif
+
+/*
+ * Define accessors for CP0 registers that are accessible to the guest. These
+ * are primarily used by common emulation code, which may need to access the
+ * registers differently depending on the implementation.
+ *
+ *    fns_hw/sw    name     type    reg num         select
+ */
+__BUILD_KVM_RW_HW(index,          32, MIPS_CP0_TLB_INDEX,    0)
+__BUILD_KVM_RW_HW(entrylo0,       l,  MIPS_CP0_TLB_LO0,      0)
+__BUILD_KVM_RW_HW(entrylo1,       l,  MIPS_CP0_TLB_LO1,      0)
+__BUILD_KVM_RW_HW(context,        l,  MIPS_CP0_TLB_CONTEXT,  0)
+__BUILD_KVM_RW_HW(contextconfig,  32, MIPS_CP0_TLB_CONTEXT,  1)
+__BUILD_KVM_RW_HW(userlocal,      l,  MIPS_CP0_TLB_CONTEXT,  2)
+__BUILD_KVM_RW_HW(xcontextconfig, l,  MIPS_CP0_TLB_CONTEXT,  3)
+__BUILD_KVM_RW_HW(pagemask,       l,  MIPS_CP0_TLB_PG_MASK,  0)
+__BUILD_KVM_RW_HW(pagegrain,      32, MIPS_CP0_TLB_PG_MASK,  1)
+__BUILD_KVM_RW_HW(segctl0,        l,  MIPS_CP0_TLB_PG_MASK,  2)
+__BUILD_KVM_RW_HW(segctl1,        l,  MIPS_CP0_TLB_PG_MASK,  3)
+__BUILD_KVM_RW_HW(segctl2,        l,  MIPS_CP0_TLB_PG_MASK,  4)
+__BUILD_KVM_RW_HW(pwbase,         l,  MIPS_CP0_TLB_PG_MASK,  5)
+__BUILD_KVM_RW_HW(pwfield,        l,  MIPS_CP0_TLB_PG_MASK,  6)
+__BUILD_KVM_RW_HW(pwsize,         l,  MIPS_CP0_TLB_PG_MASK,  7)
+__BUILD_KVM_RW_HW(wired,          32, MIPS_CP0_TLB_WIRED,    0)
+__BUILD_KVM_RW_HW(pwctl,          32, MIPS_CP0_TLB_WIRED,    6)
+__BUILD_KVM_RW_HW(hwrena,         32, MIPS_CP0_HWRENA,       0)
+__BUILD_KVM_RW_HW(badvaddr,       l,  MIPS_CP0_BAD_VADDR,    0)
+__BUILD_KVM_RW_HW(badinstr,       32, MIPS_CP0_BAD_VADDR,    1)
+__BUILD_KVM_RW_HW(badinstrp,      32, MIPS_CP0_BAD_VADDR,    2)
+__BUILD_KVM_RW_SW(count,          32, MIPS_CP0_COUNT,        0)
+__BUILD_KVM_RW_HW(entryhi,        l,  MIPS_CP0_TLB_HI,       0)
+__BUILD_KVM_RW_HW(compare,        32, MIPS_CP0_COMPARE,      0)
+__BUILD_KVM_RW_HW(status,         32, MIPS_CP0_STATUS,       0)
+__BUILD_KVM_RW_HW(intctl,         32, MIPS_CP0_STATUS,       1)
+__BUILD_KVM_RW_HW(cause,          32, MIPS_CP0_CAUSE,        0)
+__BUILD_KVM_RW_HW(epc,            l,  MIPS_CP0_EXC_PC,       0)
+__BUILD_KVM_RW_SW(prid,           32, MIPS_CP0_PRID,         0)
+__BUILD_KVM_RW_HW(ebase,          l,  MIPS_CP0_PRID,         1)
+__BUILD_KVM_RW_HW(config,         32, MIPS_CP0_CONFIG,       0)
+__BUILD_KVM_RW_HW(config1,        32, MIPS_CP0_CONFIG,       1)
+__BUILD_KVM_RW_HW(config2,        32, MIPS_CP0_CONFIG,       2)
+__BUILD_KVM_RW_HW(config3,        32, MIPS_CP0_CONFIG,       3)
+__BUILD_KVM_RW_HW(config4,        32, MIPS_CP0_CONFIG,       4)
+__BUILD_KVM_RW_HW(config5,        32, MIPS_CP0_CONFIG,       5)
+__BUILD_KVM_RW_HW(config6,        32, MIPS_CP0_CONFIG,       6)
+__BUILD_KVM_RW_HW(config7,        32, MIPS_CP0_CONFIG,       7)
+__BUILD_KVM_RW_SW(maari,          l,  MIPS_CP0_LLADDR,       2)
+__BUILD_KVM_RW_HW(xcontext,       l,  MIPS_CP0_TLB_XCONTEXT, 0)
+__BUILD_KVM_RW_HW(errorepc,       l,  MIPS_CP0_ERROR_PC,     0)
+__BUILD_KVM_RW_HW(kscratch1,      l,  MIPS_CP0_DESAVE,       2)
+__BUILD_KVM_RW_HW(kscratch2,      l,  MIPS_CP0_DESAVE,       3)
+__BUILD_KVM_RW_HW(kscratch3,      l,  MIPS_CP0_DESAVE,       4)
+__BUILD_KVM_RW_HW(kscratch4,      l,  MIPS_CP0_DESAVE,       5)
+__BUILD_KVM_RW_HW(kscratch5,      l,  MIPS_CP0_DESAVE,       6)
+__BUILD_KVM_RW_HW(kscratch6,      l,  MIPS_CP0_DESAVE,       7)
+
+/* Bitwise operations (on HW state) */
+__BUILD_KVM_SET_HW(status,        32, MIPS_CP0_STATUS,       0)
+/* Cause can be modified asynchronously from hardirq hrtimer callback */
+__BUILD_KVM_ATOMIC_HW(cause,      32, MIPS_CP0_CAUSE,        0)
+__BUILD_KVM_SET_HW(ebase,         l,  MIPS_CP0_PRID,         1)
+
+/* Bitwise operations (on saved state) */
+__BUILD_KVM_SET_SAVED(config,     32, MIPS_CP0_CONFIG,       0)
+__BUILD_KVM_SET_SAVED(config1,    32, MIPS_CP0_CONFIG,       1)
+__BUILD_KVM_SET_SAVED(config2,    32, MIPS_CP0_CONFIG,       2)
+__BUILD_KVM_SET_SAVED(config3,    32, MIPS_CP0_CONFIG,       3)
+__BUILD_KVM_SET_SAVED(config4,    32, MIPS_CP0_CONFIG,       4)
+__BUILD_KVM_SET_SAVED(config5,    32, MIPS_CP0_CONFIG,       5)
+
 /* Helpers */
 
 static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
@@ -531,6 +775,10 @@ struct kvm_mips_callbacks {
 	int (*handle_msa_fpe)(struct kvm_vcpu *vcpu);
 	int (*handle_fpe)(struct kvm_vcpu *vcpu);
 	int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
+	int (*handle_guest_exit)(struct kvm_vcpu *vcpu);
+	int (*hardware_enable)(void);
+	void (*hardware_disable)(void);
+	int (*check_extension)(struct kvm *kvm, long ext);
 	int (*vcpu_init)(struct kvm_vcpu *vcpu);
 	void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
 	int (*vcpu_setup)(struct kvm_vcpu *vcpu);
@@ -599,6 +847,10 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 
 u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr,
+				      struct kvm_vcpu *vcpu, bool write_fault);
+#endif
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
 					   struct kvm_vcpu *vcpu,
 					   bool write_fault);
@@ -625,6 +877,18 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi,
 extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu,
 				     unsigned long entryhi);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
+int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva,
+			    unsigned long *gpa);
+void kvm_vz_local_flush_roottlb_all_guests(void);
+void kvm_vz_local_flush_guesttlb_all(void);
+void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index,
+			  unsigned int count);
+void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index,
+			  unsigned int count);
+#endif
+
 void kvm_mips_suspend_mm(int cpu);
 void kvm_mips_resume_mm(int cpu);
 
@@ -795,7 +1059,7 @@ extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
 void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
 void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
-void kvm_mips_init_count(struct kvm_vcpu *vcpu);
+void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz);
 int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
 int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
 int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz);
@@ -803,6 +1067,20 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
 void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
+/* fairly internal functions requiring some care to use */
+int kvm_mips_count_disabled(struct kvm_vcpu *vcpu);
+ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count);
+int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before,
+			     u32 count, int min_drift);
+
+#ifdef CONFIG_KVM_MIPS_VZ
+void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu);
+void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu);
+#else
+static inline void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) {}
+static inline void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) {}
+#endif
+
 enum emulation_result kvm_mips_check_privilege(u32 cause,
 					       u32 *opc,
 					       struct kvm_run *run,
@@ -827,11 +1105,20 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 					    struct kvm_run *run,
 					    struct kvm_vcpu *vcpu);
 
+/* COP0 */
+enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu);
+
 unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
+/* Hypercalls (hypcall.c) */
+
+enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu,
+					    union mips_instruction inst);
+int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu);
+
 /* Dynamic binary translation */
 extern int kvm_mips_trans_cache_index(union mips_instruction inst,
 				      u32 *opc, struct kvm_vcpu *vcpu);
@@ -846,7 +1133,6 @@ extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
 extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu);
 extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm);
 
-static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
diff --git a/arch/mips/include/asm/maar.h b/arch/mips/include/asm/maar.h
index 21d9607..e10f78b 100644
--- a/arch/mips/include/asm/maar.h
+++ b/arch/mips/include/asm/maar.h
@@ -36,7 +36,7 @@ unsigned platform_maar_init(unsigned num_pairs);
  * @upper:	The highest address that the MAAR pair will affect. Must be
  *		aligned to one byte before a 2^16 byte boundary.
  * @attrs:	The accessibility attributes to program, eg. MIPS_MAAR_S. The
- *		MIPS_MAAR_V attribute will automatically be set.
+ *		MIPS_MAAR_VL attribute will automatically be set.
  *
  * Program the pair of MAAR registers specified by idx to apply the attributes
  * specified by attrs to the range of addresses from lower to higher.
@@ -49,10 +49,10 @@ static inline void write_maar_pair(unsigned idx, phys_addr_t lower,
 	BUG_ON(((upper & 0xffff) != 0xffff)
 		|| ((upper & ~0xffffull) & ~(MIPS_MAAR_ADDR << 4)));
 
-	/* Automatically set MIPS_MAAR_V */
-	attrs |= MIPS_MAAR_V;
+	/* Automatically set MIPS_MAAR_VL */
+	attrs |= MIPS_MAAR_VL;
 
-	/* Write the upper address & attributes (only MIPS_MAAR_V matters) */
+	/* Write the upper address & attributes (only MIPS_MAAR_VL matters) */
 	write_c0_maari(idx << 1);
 	back_to_back_c0_hazard();
 	write_c0_maar(((upper >> 4) & MIPS_MAAR_ADDR) | attrs);
@@ -81,7 +81,7 @@ extern void maar_init(void);
  * @upper:	The highest address that the MAAR pair will affect. Must be
  *		aligned to one byte before a 2^16 byte boundary.
  * @attrs:	The accessibility attributes to program, eg. MIPS_MAAR_S. The
- *		MIPS_MAAR_V attribute will automatically be set.
+ *		MIPS_MAAR_VL attribute will automatically be set.
  *
  * Describes the configuration of a pair of Memory Accessibility Attribute
  * Registers - applying attributes from attrs to the range of physical
diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index f8d1d2f..6875b69 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -34,8 +34,10 @@
  */
 #ifdef __ASSEMBLY__
 #define _ULCAST_
+#define _U64CAST_
 #else
 #define _ULCAST_ (unsigned long)
+#define _U64CAST_ (u64)
 #endif
 
 /*
@@ -217,8 +219,10 @@
 /*
  * Wired register bits
  */
-#define MIPSR6_WIRED_LIMIT	(_ULCAST_(0xffff) << 16)
-#define MIPSR6_WIRED_WIRED	(_ULCAST_(0xffff) << 0)
+#define MIPSR6_WIRED_LIMIT_SHIFT 16
+#define MIPSR6_WIRED_LIMIT	(_ULCAST_(0xffff) << MIPSR6_WIRED_LIMIT_SHIFT)
+#define MIPSR6_WIRED_WIRED_SHIFT 0
+#define MIPSR6_WIRED_WIRED	(_ULCAST_(0xffff) << MIPSR6_WIRED_WIRED_SHIFT)
 
 /*
  * Values used for computation of new tlb entries
@@ -645,6 +649,7 @@
 #define MIPS_CONF5_LLB		(_ULCAST_(1) << 4)
 #define MIPS_CONF5_MVH		(_ULCAST_(1) << 5)
 #define MIPS_CONF5_VP		(_ULCAST_(1) << 7)
+#define MIPS_CONF5_SBRI		(_ULCAST_(1) << 6)
 #define MIPS_CONF5_FRE		(_ULCAST_(1) << 8)
 #define MIPS_CONF5_UFE		(_ULCAST_(1) << 9)
 #define MIPS_CONF5_MSAEN	(_ULCAST_(1) << 27)
@@ -719,10 +724,14 @@
 #define XLR_PERFCTRL_ALLTHREADS	(_ULCAST_(1) << 13)
 
 /* MAAR bit definitions */
+#define MIPS_MAAR_VH		(_U64CAST_(1) << 63)
 #define MIPS_MAAR_ADDR		((BIT_ULL(BITS_PER_LONG - 12) - 1) << 12)
 #define MIPS_MAAR_ADDR_SHIFT	12
 #define MIPS_MAAR_S		(_ULCAST_(1) << 1)
-#define MIPS_MAAR_V		(_ULCAST_(1) << 0)
+#define MIPS_MAAR_VL		(_ULCAST_(1) << 0)
+
+/* MAARI bit definitions */
+#define MIPS_MAARI_INDEX	(_ULCAST_(0x3f) << 0)
 
 /* EBase bit definitions */
 #define MIPS_EBASE_CPUNUM_SHIFT	0
@@ -736,6 +745,10 @@
 #define MIPS_CMGCRB_BASE	11
 #define MIPS_CMGCRF_BASE	(~_ULCAST_((1 << MIPS_CMGCRB_BASE) - 1))
 
+/* LLAddr bit definitions */
+#define MIPS_LLADDR_LLB_SHIFT	0
+#define MIPS_LLADDR_LLB		(_ULCAST_(1) << MIPS_LLADDR_LLB_SHIFT)
+
 /*
  * Bits in the MIPS32 Memory Segmentation registers.
  */
@@ -961,6 +974,22 @@
 /* Flush FTLB */
 #define LOONGSON_DIAG_FTLB	(_ULCAST_(1) << 13)
 
+/* CvmCtl register field definitions */
+#define CVMCTL_IPPCI_SHIFT	7
+#define CVMCTL_IPPCI		(_U64CAST_(0x7) << CVMCTL_IPPCI_SHIFT)
+#define CVMCTL_IPTI_SHIFT	4
+#define CVMCTL_IPTI		(_U64CAST_(0x7) << CVMCTL_IPTI_SHIFT)
+
+/* CvmMemCtl2 register field definitions */
+#define CVMMEMCTL2_INHIBITTS	(_U64CAST_(1) << 17)
+
+/* CvmVMConfig register field definitions */
+#define CVMVMCONF_DGHT		(_U64CAST_(1) << 60)
+#define CVMVMCONF_MMUSIZEM1_S	12
+#define CVMVMCONF_MMUSIZEM1	(_U64CAST_(0xff) << CVMVMCONF_MMUSIZEM1_S)
+#define CVMVMCONF_RMMUSIZEM1_S	0
+#define CVMVMCONF_RMMUSIZEM1	(_U64CAST_(0xff) << CVMVMCONF_RMMUSIZEM1_S)
+
 /*
  * Coprocessor 1 (FPU) register names
  */
@@ -1720,6 +1749,13 @@ do {									\
 
 #define read_c0_cvmmemctl()	__read_64bit_c0_register($11, 7)
 #define write_c0_cvmmemctl(val) __write_64bit_c0_register($11, 7, val)
+
+#define read_c0_cvmmemctl2()	__read_64bit_c0_register($16, 6)
+#define write_c0_cvmmemctl2(val) __write_64bit_c0_register($16, 6, val)
+
+#define read_c0_cvmvmconfig()	__read_64bit_c0_register($16, 7)
+#define write_c0_cvmvmconfig(val) __write_64bit_c0_register($16, 7, val)
+
 /*
  * The cacheerr registers are not standardized.	 On OCTEON, they are
  * 64 bits wide.
@@ -1989,6 +2025,8 @@ do {									\
 #define read_gc0_epc()			__read_ulong_gc0_register(14, 0)
 #define write_gc0_epc(val)		__write_ulong_gc0_register(14, 0, val)
 
+#define read_gc0_prid()			__read_32bit_gc0_register(15, 0)
+
 #define read_gc0_ebase()		__read_32bit_gc0_register(15, 1)
 #define write_gc0_ebase(val)		__write_32bit_gc0_register(15, 1, val)
 
@@ -2012,6 +2050,9 @@ do {									\
 #define write_gc0_config6(val)		__write_32bit_gc0_register(16, 6, val)
 #define write_gc0_config7(val)		__write_32bit_gc0_register(16, 7, val)
 
+#define read_gc0_lladdr()		__read_ulong_gc0_register(17, 0)
+#define write_gc0_lladdr(val)		__write_ulong_gc0_register(17, 0, val)
+
 #define read_gc0_watchlo0()		__read_ulong_gc0_register(18, 0)
 #define read_gc0_watchlo1()		__read_ulong_gc0_register(18, 1)
 #define read_gc0_watchlo2()		__read_ulong_gc0_register(18, 2)
@@ -2090,6 +2131,19 @@ do {									\
 #define write_gc0_kscratch5(val)	__write_ulong_gc0_register(31, 6, val)
 #define write_gc0_kscratch6(val)	__write_ulong_gc0_register(31, 7, val)
 
+/* Cavium OCTEON (cnMIPS) */
+#define read_gc0_cvmcount()		__read_ulong_gc0_register(9, 6)
+#define write_gc0_cvmcount(val)		__write_ulong_gc0_register(9, 6, val)
+
+#define read_gc0_cvmctl()		__read_64bit_gc0_register(9, 7)
+#define write_gc0_cvmctl(val)		__write_64bit_gc0_register(9, 7, val)
+
+#define read_gc0_cvmmemctl()		__read_64bit_gc0_register(11, 7)
+#define write_gc0_cvmmemctl(val)	__write_64bit_gc0_register(11, 7, val)
+
+#define read_gc0_cvmmemctl2()		__read_64bit_gc0_register(16, 6)
+#define write_gc0_cvmmemctl2(val)	__write_64bit_gc0_register(16, 6, val)
+
 /*
  * Macros to access the floating point coprocessor control registers
  */
@@ -2696,9 +2750,11 @@ __BUILD_SET_C0(brcm_mode)
  */
 #define __BUILD_SET_GC0(name)	__BUILD_SET_COMMON(gc0_##name)
 
+__BUILD_SET_GC0(wired)
 __BUILD_SET_GC0(status)
 __BUILD_SET_GC0(cause)
 __BUILD_SET_GC0(ebase)
+__BUILD_SET_GC0(config1)
 
 /*
  * Return low 10 bits of ebase.
diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h
index dd179fd..939734d 100644
--- a/arch/mips/include/asm/tlb.h
+++ b/arch/mips/include/asm/tlb.h
@@ -21,9 +21,11 @@
  */
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
 
-#define UNIQUE_ENTRYHI(idx)						\
-		((CKSEG0 + ((idx) << (PAGE_SHIFT + 1))) |		\
+#define _UNIQUE_ENTRYHI(base, idx)					\
+		(((base) + ((idx) << (PAGE_SHIFT + 1))) |		\
 		 (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0))
+#define UNIQUE_ENTRYHI(idx)		_UNIQUE_ENTRYHI(CKSEG0, idx)
+#define UNIQUE_GUEST_ENTRYHI(idx)	_UNIQUE_ENTRYHI(CKSEG1, idx)
 
 static inline unsigned int num_wired_entries(void)
 {
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 77429d1..b5e46ae 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -179,7 +179,7 @@ enum cop0_coi_func {
 	tlbr_op	      = 0x01, tlbwi_op	    = 0x02,
 	tlbwr_op      = 0x06, tlbp_op	    = 0x08,
 	rfe_op	      = 0x10, eret_op	    = 0x18,
-	wait_op       = 0x20,
+	wait_op       = 0x20, hypcall_op    = 0x28
 };
 
 /*
diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h
index a8a0199..0318c6b 100644
--- a/arch/mips/include/uapi/asm/kvm.h
+++ b/arch/mips/include/uapi/asm/kvm.h
@@ -21,6 +21,8 @@
 
 #define __KVM_HAVE_READONLY_MEM
 
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 /*
  * for KVM_GET_REGS and KVM_SET_REGS
  *
@@ -54,9 +56,14 @@ struct kvm_fpu {
  * Register set = 0: GP registers from kvm_regs (see definitions below).
  *
  * Register set = 1: CP0 registers.
- *  bits[15..8]  - Must be zero.
- *  bits[7..3]   - Register 'rd'  index.
- *  bits[2..0]   - Register 'sel' index.
+ *  bits[15..8]  - COP0 register set.
+ *
+ *  COP0 register set = 0: Main CP0 registers.
+ *   bits[7..3]   - Register 'rd'  index.
+ *   bits[2..0]   - Register 'sel' index.
+ *
+ *  COP0 register set = 1: MAARs.
+ *   bits[7..0]   - MAAR index.
  *
  * Register set = 2: KVM specific registers (see definitions below).
  *
@@ -115,6 +122,15 @@ struct kvm_fpu {
 
 
 /*
+ * KVM_REG_MIPS_CP0 - Coprocessor 0 registers.
+ */
+
+#define KVM_REG_MIPS_MAAR	(KVM_REG_MIPS_CP0 | (1 << 8))
+#define KVM_REG_MIPS_CP0_MAAR(n)	(KVM_REG_MIPS_MAAR | \
+					 KVM_REG_SIZE_U64 | (n))
+
+
+/*
  * KVM_REG_MIPS_KVM - KVM specific control registers.
  */
 
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index 07718bb..c72a4cd 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -289,6 +289,8 @@ static void cpu_set_fpu_opts(struct cpuinfo_mips *c)
 			    MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) {
 		if (c->fpu_id & MIPS_FPIR_3D)
 			c->ases |= MIPS_ASE_MIPS3D;
+		if (c->fpu_id & MIPS_FPIR_UFRP)
+			c->options |= MIPS_CPU_UFR;
 		if (c->fpu_id & MIPS_FPIR_FREP)
 			c->options |= MIPS_CPU_FRE;
 	}
@@ -1003,7 +1005,8 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c)
 	unsigned int config3, config3_dyn;
 
 	probe_gc0_config_dyn(config3, config3, config3_dyn,
-			     MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_CTXTC);
+			     MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_ULRI |
+			     MIPS_CONF3_CTXTC);
 
 	if (config3 & MIPS_CONF3_CTXTC)
 		c->guest.options |= MIPS_CPU_CTXTC;
@@ -1013,6 +1016,9 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c)
 	if (config3 & MIPS_CONF3_PW)
 		c->guest.options |= MIPS_CPU_HTW;
 
+	if (config3 & MIPS_CONF3_ULRI)
+		c->guest.options |= MIPS_CPU_ULRI;
+
 	if (config3 & MIPS_CONF3_SC)
 		c->guest.options |= MIPS_CPU_SEGMENTS;
 
@@ -1051,7 +1057,7 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c)
 	unsigned int config5, config5_dyn;
 
 	probe_gc0_config_dyn(config5, config5, config5_dyn,
-			 MIPS_CONF_M | MIPS_CONF5_MRP);
+			 MIPS_CONF_M | MIPS_CONF5_MVH | MIPS_CONF5_MRP);
 
 	if (config5 & MIPS_CONF5_MRP)
 		c->guest.options |= MIPS_CPU_MAAR;
@@ -1061,6 +1067,9 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c)
 	if (config5 & MIPS_CONF5_LLB)
 		c->guest.options |= MIPS_CPU_RW_LLB;
 
+	if (config5 & MIPS_CONF5_MVH)
+		c->guest.options |= MIPS_CPU_MVH;
+
 	if (config5 & MIPS_CONF_M)
 		c->guest.conf |= BIT(6);
 	return config5 & MIPS_CONF_M;
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index a7f8126..c036157 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -70,6 +70,7 @@ EXPORT_SYMBOL(perf_irq);
  */
 
 unsigned int mips_hpt_frequency;
+EXPORT_SYMBOL_GPL(mips_hpt_frequency);
 
 /*
  * This function exists in order to cause an error due to a duplicate
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 6506732..50a722d 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -26,11 +26,34 @@ config KVM
 	select SRCU
 	---help---
 	  Support for hosting Guest kernels.
-	  Currently supported on MIPS32 processors.
+
+choice
+	prompt "Virtualization mode"
+	depends on KVM
+	default KVM_MIPS_TE
+
+config KVM_MIPS_TE
+	bool "Trap & Emulate"
+	---help---
+	  Use trap and emulate to virtualize 32-bit guests in user mode. This
+	  does not require any special hardware Virtualization support beyond
+	  standard MIPS32/64 r2 or later, but it does require the guest kernel
+	  to be configured with CONFIG_KVM_GUEST=y so that it resides in the
+	  user address segment.
+
+config KVM_MIPS_VZ
+	bool "MIPS Virtualization (VZ) ASE"
+	---help---
+	  Use the MIPS Virtualization (VZ) ASE to virtualize guests. This
+	  supports running unmodified guest kernels (with CONFIG_KVM_GUEST=n),
+	  but requires hardware support.
+
+endchoice
 
 config KVM_MIPS_DYN_TRANS
 	bool "KVM/MIPS: Dynamic binary translation to reduce traps"
-	depends on KVM
+	depends on KVM_MIPS_TE
+	default y
 	---help---
 	  When running in Trap & Emulate mode patch privileged
 	  instructions to reduce the number of traps.
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 847429d..45d90f5 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -9,8 +9,15 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 
 kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
 	    interrupt.o stats.o commpage.o \
-	    dyntrans.o trap_emul.o fpu.o
+	    fpu.o
+kvm-objs += hypcall.o
 kvm-objs += mmu.o
 
+ifdef CONFIG_KVM_MIPS_VZ
+kvm-objs		+= vz.o
+else
+kvm-objs		+= dyntrans.o
+kvm-objs		+= trap_emul.o
+endif
 obj-$(CONFIG_KVM)	+= kvm.o
 obj-y			+= callback.o tlb.o
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index d40cfaa..4144bfa 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -308,7 +308,7 @@ int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
  *		CP0_Cause.DC bit or the count_ctl.DC bit.
  *		0 otherwise (in which case CP0_Count timer is running).
  */
-static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
+int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 
@@ -467,7 +467,7 @@ u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
  *
  * Returns:	The ktime at the point of freeze.
  */
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
+ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
 {
 	ktime_t now;
 
@@ -517,6 +517,82 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
 }
 
 /**
+ * kvm_mips_restore_hrtimer() - Restore hrtimer after a gap, updating expiry.
+ * @vcpu:	Virtual CPU.
+ * @before:	Time before Count was saved, lower bound of drift calculation.
+ * @count:	CP0_Count at point of restore.
+ * @min_drift:	Minimum amount of drift permitted before correction.
+ *		Must be <= 0.
+ *
+ * Restores the timer from a particular @count, accounting for drift. This can
+ * be used in conjunction with kvm_mips_freeze_timer() when a hardware timer is
+ * to be used for a period of time, but the exact ktime corresponding to the
+ * final Count that must be restored is not known.
+ *
+ * It is gauranteed that a timer interrupt immediately after restore will be
+ * handled, but not if CP0_Compare is exactly at @count. That case should
+ * already be handled when the hardware timer state is saved.
+ *
+ * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is not
+ * stopped).
+ *
+ * Returns:	Amount of correction to count_bias due to drift.
+ */
+int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before,
+			     u32 count, int min_drift)
+{
+	ktime_t now, count_time;
+	u32 now_count, before_count;
+	u64 delta;
+	int drift, ret = 0;
+
+	/* Calculate expected count at before */
+	before_count = vcpu->arch.count_bias +
+			kvm_mips_ktime_to_count(vcpu, before);
+
+	/*
+	 * Detect significantly negative drift, where count is lower than
+	 * expected. Some negative drift is expected when hardware counter is
+	 * set after kvm_mips_freeze_timer(), and it is harmless to allow the
+	 * time to jump forwards a little, within reason. If the drift is too
+	 * significant, adjust the bias to avoid a big Guest.CP0_Count jump.
+	 */
+	drift = count - before_count;
+	if (drift < min_drift) {
+		count_time = before;
+		vcpu->arch.count_bias += drift;
+		ret = drift;
+		goto resume;
+	}
+
+	/* Calculate expected count right now */
+	now = ktime_get();
+	now_count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+
+	/*
+	 * Detect positive drift, where count is higher than expected, and
+	 * adjust the bias to avoid guest time going backwards.
+	 */
+	drift = count - now_count;
+	if (drift > 0) {
+		count_time = now;
+		vcpu->arch.count_bias += drift;
+		ret = drift;
+		goto resume;
+	}
+
+	/* Subtract nanosecond delta to find ktime when count was read */
+	delta = (u64)(u32)(now_count - count);
+	delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
+	count_time = ktime_sub_ns(now, delta);
+
+resume:
+	/* Resume using the calculated ktime */
+	kvm_mips_resume_hrtimer(vcpu, count_time, count);
+	return ret;
+}
+
+/**
  * kvm_mips_write_count() - Modify the count and update timer.
  * @vcpu:	Virtual CPU.
  * @count:	Guest CP0_Count value to set.
@@ -543,16 +619,15 @@ void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
 /**
  * kvm_mips_init_count() - Initialise timer.
  * @vcpu:	Virtual CPU.
+ * @count_hz:	Frequency of timer.
  *
- * Initialise the timer to a sensible frequency, namely 100MHz, zero it, and set
- * it going if it's enabled.
+ * Initialise the timer to the specified frequency, zero it, and set it going if
+ * it's enabled.
  */
-void kvm_mips_init_count(struct kvm_vcpu *vcpu)
+void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz)
 {
-	/* 100 MHz */
-	vcpu->arch.count_hz = 100*1000*1000;
-	vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32,
-					  vcpu->arch.count_hz);
+	vcpu->arch.count_hz = count_hz;
+	vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, count_hz);
 	vcpu->arch.count_dyn_bias = 0;
 
 	/* Starting at 0 */
@@ -622,7 +697,9 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	int dc;
 	u32 old_compare = kvm_read_c0_guest_compare(cop0);
-	ktime_t now;
+	s32 delta = compare - old_compare;
+	u32 cause;
+	ktime_t now = ktime_set(0, 0); /* silence bogus GCC warning */
 	u32 count;
 
 	/* if unchanged, must just be an ack */
@@ -634,6 +711,21 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 		return;
 	}
 
+	/*
+	 * If guest CP0_Compare moves forward, CP0_GTOffset should be adjusted
+	 * too to prevent guest CP0_Count hitting guest CP0_Compare.
+	 *
+	 * The new GTOffset corresponds to the new value of CP0_Compare, and is
+	 * set prior to it being written into the guest context. We disable
+	 * preemption until the new value is written to prevent restore of a
+	 * GTOffset corresponding to the old CP0_Compare value.
+	 */
+	if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta > 0) {
+		preempt_disable();
+		write_c0_gtoffset(compare - read_c0_count());
+		back_to_back_c0_hazard();
+	}
+
 	/* freeze_hrtimer() takes care of timer interrupts <= count */
 	dc = kvm_mips_count_disabled(vcpu);
 	if (!dc)
@@ -641,12 +733,36 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 
 	if (ack)
 		kvm_mips_callbacks->dequeue_timer_int(vcpu);
+	else if (IS_ENABLED(CONFIG_KVM_MIPS_VZ))
+		/*
+		 * With VZ, writing CP0_Compare acks (clears) CP0_Cause.TI, so
+		 * preserve guest CP0_Cause.TI if we don't want to ack it.
+		 */
+		cause = kvm_read_c0_guest_cause(cop0);
 
 	kvm_write_c0_guest_compare(cop0, compare);
 
+	if (IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+		if (delta > 0)
+			preempt_enable();
+
+		back_to_back_c0_hazard();
+
+		if (!ack && cause & CAUSEF_TI)
+			kvm_write_c0_guest_cause(cop0, cause);
+	}
+
 	/* resume_hrtimer() takes care of timer interrupts > count */
 	if (!dc)
 		kvm_mips_resume_hrtimer(vcpu, now, count);
+
+	/*
+	 * If guest CP0_Compare is moving backward, we delay CP0_GTOffset change
+	 * until after the new CP0_Compare is written, otherwise new guest
+	 * CP0_Count could hit new guest CP0_Compare.
+	 */
+	if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta <= 0)
+		write_c0_gtoffset(compare - read_c0_count());
 }
 
 /**
@@ -857,6 +973,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 	++vcpu->stat.wait_exits;
 	trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
 	if (!vcpu->arch.pending_exceptions) {
+		kvm_vz_lose_htimer(vcpu);
 		vcpu->arch.wait = 1;
 		kvm_vcpu_block(vcpu);
 
@@ -865,7 +982,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 		 * check if any I/O interrupts are pending.
 		 */
 		if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
-			clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		}
 	}
@@ -873,17 +990,62 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
-/*
- * XXXKYMA: Linux doesn't seem to use TLBR, return EMULATE_FAIL for now so that
- * we can catch this, if things ever change
- */
+static void kvm_mips_change_entryhi(struct kvm_vcpu *vcpu,
+				    unsigned long entryhi)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+	int cpu, i;
+	u32 nasid = entryhi & KVM_ENTRYHI_ASID;
+
+	if (((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) {
+		trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) &
+				      KVM_ENTRYHI_ASID, nasid);
+
+		/*
+		 * Flush entries from the GVA page tables.
+		 * Guest user page table will get flushed lazily on re-entry to
+		 * guest user if the guest ASID actually changes.
+		 */
+		kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_KERN);
+
+		/*
+		 * Regenerate/invalidate kernel MMU context.
+		 * The user MMU context will be regenerated lazily on re-entry
+		 * to guest user if the guest ASID actually changes.
+		 */
+		preempt_disable();
+		cpu = smp_processor_id();
+		get_new_mmu_context(kern_mm, cpu);
+		for_each_possible_cpu(i)
+			if (i != cpu)
+				cpu_context(i, kern_mm) = 0;
+		preempt_enable();
+	}
+	kvm_write_c0_guest_entryhi(cop0, entryhi);
+}
+
 enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	struct kvm_mips_tlb *tlb;
 	unsigned long pc = vcpu->arch.pc;
+	int index;
 
-	kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
-	return EMULATE_FAIL;
+	index = kvm_read_c0_guest_index(cop0);
+	if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
+		/* UNDEFINED */
+		kvm_debug("[%#lx] TLBR Index %#x out of range\n", pc, index);
+		index &= KVM_MIPS_GUEST_TLB_SIZE - 1;
+	}
+
+	tlb = &vcpu->arch.guest_tlb[index];
+	kvm_write_c0_guest_pagemask(cop0, tlb->tlb_mask);
+	kvm_write_c0_guest_entrylo0(cop0, tlb->tlb_lo[0]);
+	kvm_write_c0_guest_entrylo1(cop0, tlb->tlb_lo[1]);
+	kvm_mips_change_entryhi(vcpu, tlb->tlb_hi);
+
+	return EMULATE_DONE;
 }
 
 /**
@@ -1105,11 +1267,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 					   struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
 	enum emulation_result er = EMULATE_DONE;
 	u32 rt, rd, sel;
 	unsigned long curr_pc;
-	int cpu, i;
 
 	/*
 	 * Update PC and hold onto current PC in case there is
@@ -1143,6 +1303,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 		case wait_op:
 			er = kvm_mips_emul_wait(vcpu);
 			break;
+		case hypcall_op:
+			er = kvm_mips_emul_hypcall(vcpu, inst);
+			break;
 		}
 	} else {
 		rt = inst.c0r_format.rt;
@@ -1208,44 +1371,8 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
 				kvm_change_c0_guest_ebase(cop0, 0x1ffff000,
 							  vcpu->arch.gprs[rt]);
 			} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
-				u32 nasid =
-					vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
-				if (((kvm_read_c0_guest_entryhi(cop0) &
-				      KVM_ENTRYHI_ASID) != nasid)) {
-					trace_kvm_asid_change(vcpu,
-						kvm_read_c0_guest_entryhi(cop0)
-							& KVM_ENTRYHI_ASID,
-						nasid);
-
-					/*
-					 * Flush entries from the GVA page
-					 * tables.
-					 * Guest user page table will get
-					 * flushed lazily on re-entry to guest
-					 * user if the guest ASID actually
-					 * changes.
-					 */
-					kvm_mips_flush_gva_pt(kern_mm->pgd,
-							      KMF_KERN);
-
-					/*
-					 * Regenerate/invalidate kernel MMU
-					 * context.
-					 * The user MMU context will be
-					 * regenerated lazily on re-entry to
-					 * guest user if the guest ASID actually
-					 * changes.
-					 */
-					preempt_disable();
-					cpu = smp_processor_id();
-					get_new_mmu_context(kern_mm, cpu);
-					for_each_possible_cpu(i)
-						if (i != cpu)
-							cpu_context(i, kern_mm) = 0;
-					preempt_enable();
-				}
-				kvm_write_c0_guest_entryhi(cop0,
-							   vcpu->arch.gprs[rt]);
+				kvm_mips_change_entryhi(vcpu,
+							vcpu->arch.gprs[rt]);
 			}
 			/* Are we writing to COUNT */
 			else if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
@@ -1474,9 +1601,8 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
 					     struct kvm_run *run,
 					     struct kvm_vcpu *vcpu)
 {
-	enum emulation_result er = EMULATE_DO_MMIO;
+	enum emulation_result er;
 	u32 rt;
-	u32 bytes;
 	void *data = run->mmio.data;
 	unsigned long curr_pc;
 
@@ -1491,103 +1617,74 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
 
 	rt = inst.i_format.rt;
 
+	run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
+						vcpu->arch.host_cp0_badvaddr);
+	if (run->mmio.phys_addr == KVM_INVALID_ADDR)
+		goto out_fail;
+
 	switch (inst.i_format.opcode) {
-	case sb_op:
-		bytes = 1;
-		if (bytes > sizeof(run->mmio.data)) {
-			kvm_err("%s: bad MMIO length: %d\n", __func__,
-			       run->mmio.len);
-		}
-		run->mmio.phys_addr =
-		    kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-						   host_cp0_badvaddr);
-		if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-			er = EMULATE_FAIL;
-			break;
-		}
-		run->mmio.len = bytes;
-		run->mmio.is_write = 1;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_is_write = 1;
-		*(u8 *) data = vcpu->arch.gprs[rt];
-		kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
-			  vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
-			  *(u8 *) data);
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
+	case sd_op:
+		run->mmio.len = 8;
+		*(u64 *)data = vcpu->arch.gprs[rt];
 
+		kvm_debug("[%#lx] OP_SD: eaddr: %#lx, gpr: %#lx, data: %#llx\n",
+			  vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
+			  vcpu->arch.gprs[rt], *(u64 *)data);
 		break;
+#endif
 
 	case sw_op:
-		bytes = 4;
-		if (bytes > sizeof(run->mmio.data)) {
-			kvm_err("%s: bad MMIO length: %d\n", __func__,
-			       run->mmio.len);
-		}
-		run->mmio.phys_addr =
-		    kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-						   host_cp0_badvaddr);
-		if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-			er = EMULATE_FAIL;
-			break;
-		}
-
-		run->mmio.len = bytes;
-		run->mmio.is_write = 1;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_is_write = 1;
-		*(u32 *) data = vcpu->arch.gprs[rt];
+		run->mmio.len = 4;
+		*(u32 *)data = vcpu->arch.gprs[rt];
 
 		kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
 			  vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-			  vcpu->arch.gprs[rt], *(u32 *) data);
+			  vcpu->arch.gprs[rt], *(u32 *)data);
 		break;
 
 	case sh_op:
-		bytes = 2;
-		if (bytes > sizeof(run->mmio.data)) {
-			kvm_err("%s: bad MMIO length: %d\n", __func__,
-			       run->mmio.len);
-		}
-		run->mmio.phys_addr =
-		    kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-						   host_cp0_badvaddr);
-		if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-			er = EMULATE_FAIL;
-			break;
-		}
-
-		run->mmio.len = bytes;
-		run->mmio.is_write = 1;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_is_write = 1;
-		*(u16 *) data = vcpu->arch.gprs[rt];
+		run->mmio.len = 2;
+		*(u16 *)data = vcpu->arch.gprs[rt];
 
 		kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
 			  vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-			  vcpu->arch.gprs[rt], *(u32 *) data);
+			  vcpu->arch.gprs[rt], *(u16 *)data);
+		break;
+
+	case sb_op:
+		run->mmio.len = 1;
+		*(u8 *)data = vcpu->arch.gprs[rt];
+
+		kvm_debug("[%#lx] OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
+			  vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
+			  vcpu->arch.gprs[rt], *(u8 *)data);
 		break;
 
 	default:
 		kvm_err("Store not yet supported (inst=0x%08x)\n",
 			inst.word);
-		er = EMULATE_FAIL;
-		break;
+		goto out_fail;
 	}
 
-	/* Rollback PC if emulation was unsuccessful */
-	if (er == EMULATE_FAIL)
-		vcpu->arch.pc = curr_pc;
+	run->mmio.is_write = 1;
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_is_write = 1;
+	return EMULATE_DO_MMIO;
 
-	return er;
+out_fail:
+	/* Rollback PC if emulation was unsuccessful */
+	vcpu->arch.pc = curr_pc;
+	return EMULATE_FAIL;
 }
 
 enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 					    u32 cause, struct kvm_run *run,
 					    struct kvm_vcpu *vcpu)
 {
-	enum emulation_result er = EMULATE_DO_MMIO;
+	enum emulation_result er;
 	unsigned long curr_pc;
 	u32 op, rt;
-	u32 bytes;
 
 	rt = inst.i_format.rt;
 	op = inst.i_format.opcode;
@@ -1606,96 +1703,53 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 
 	vcpu->arch.io_gpr = rt;
 
+	run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
+						vcpu->arch.host_cp0_badvaddr);
+	if (run->mmio.phys_addr == KVM_INVALID_ADDR)
+		return EMULATE_FAIL;
+
+	vcpu->mmio_needed = 2;	/* signed */
 	switch (op) {
-	case lw_op:
-		bytes = 4;
-		if (bytes > sizeof(run->mmio.data)) {
-			kvm_err("%s: bad MMIO length: %d\n", __func__,
-			       run->mmio.len);
-			er = EMULATE_FAIL;
-			break;
-		}
-		run->mmio.phys_addr =
-		    kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-						   host_cp0_badvaddr);
-		if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-			er = EMULATE_FAIL;
-			break;
-		}
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
+	case ld_op:
+		run->mmio.len = 8;
+		break;
 
-		run->mmio.len = bytes;
-		run->mmio.is_write = 0;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_is_write = 0;
+	case lwu_op:
+		vcpu->mmio_needed = 1;	/* unsigned */
+		/* fall through */
+#endif
+	case lw_op:
+		run->mmio.len = 4;
 		break;
 
-	case lh_op:
 	case lhu_op:
-		bytes = 2;
-		if (bytes > sizeof(run->mmio.data)) {
-			kvm_err("%s: bad MMIO length: %d\n", __func__,
-			       run->mmio.len);
-			er = EMULATE_FAIL;
-			break;
-		}
-		run->mmio.phys_addr =
-		    kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-						   host_cp0_badvaddr);
-		if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-			er = EMULATE_FAIL;
-			break;
-		}
-
-		run->mmio.len = bytes;
-		run->mmio.is_write = 0;
-		vcpu->mmio_needed = 1;
-		vcpu->mmio_is_write = 0;
-
-		if (op == lh_op)
-			vcpu->mmio_needed = 2;
-		else
-			vcpu->mmio_needed = 1;
-
+		vcpu->mmio_needed = 1;	/* unsigned */
+		/* fall through */
+	case lh_op:
+		run->mmio.len = 2;
 		break;
 
 	case lbu_op:
+		vcpu->mmio_needed = 1;	/* unsigned */
+		/* fall through */
 	case lb_op:
-		bytes = 1;
-		if (bytes > sizeof(run->mmio.data)) {
-			kvm_err("%s: bad MMIO length: %d\n", __func__,
-			       run->mmio.len);
-			er = EMULATE_FAIL;
-			break;
-		}
-		run->mmio.phys_addr =
-		    kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-						   host_cp0_badvaddr);
-		if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-			er = EMULATE_FAIL;
-			break;
-		}
-
-		run->mmio.len = bytes;
-		run->mmio.is_write = 0;
-		vcpu->mmio_is_write = 0;
-
-		if (op == lb_op)
-			vcpu->mmio_needed = 2;
-		else
-			vcpu->mmio_needed = 1;
-
+		run->mmio.len = 1;
 		break;
 
 	default:
 		kvm_err("Load not yet supported (inst=0x%08x)\n",
 			inst.word);
-		er = EMULATE_FAIL;
-		break;
+		vcpu->mmio_needed = 0;
+		return EMULATE_FAIL;
 	}
 
-	return er;
+	run->mmio.is_write = 0;
+	vcpu->mmio_is_write = 0;
+	return EMULATE_DO_MMIO;
 }
 
+#ifndef CONFIG_KVM_MIPS_VZ
 static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
 						     unsigned long curr_pc,
 						     unsigned long addr,
@@ -1786,11 +1840,35 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 			  vcpu->arch.pc, vcpu->arch.gprs[31], cache, op, base,
 			  arch->gprs[base], offset);
 
-		if (cache == Cache_D)
+		if (cache == Cache_D) {
+#ifdef CONFIG_CPU_R4K_CACHE_TLB
 			r4k_blast_dcache();
-		else if (cache == Cache_I)
+#else
+			switch (boot_cpu_type()) {
+			case CPU_CAVIUM_OCTEON3:
+				/* locally flush icache */
+				local_flush_icache_range(0, 0);
+				break;
+			default:
+				__flush_cache_all();
+				break;
+			}
+#endif
+		} else if (cache == Cache_I) {
+#ifdef CONFIG_CPU_R4K_CACHE_TLB
 			r4k_blast_icache();
-		else {
+#else
+			switch (boot_cpu_type()) {
+			case CPU_CAVIUM_OCTEON3:
+				/* locally flush icache */
+				local_flush_icache_range(0, 0);
+				break;
+			default:
+				flush_icache_all();
+				break;
+			}
+#endif
+		} else {
 			kvm_err("%s: unsupported CACHE INDEX operation\n",
 				__func__);
 			return EMULATE_FAIL;
@@ -1870,18 +1948,6 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 	case cop0_op:
 		er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
 		break;
-	case sb_op:
-	case sh_op:
-	case sw_op:
-		er = kvm_mips_emulate_store(inst, cause, run, vcpu);
-		break;
-	case lb_op:
-	case lbu_op:
-	case lhu_op:
-	case lh_op:
-	case lw_op:
-		er = kvm_mips_emulate_load(inst, cause, run, vcpu);
-		break;
 
 #ifndef CONFIG_CPU_MIPSR6
 	case cache_op:
@@ -1915,6 +1981,7 @@ unknown:
 
 	return er;
 }
+#endif /* CONFIG_KVM_MIPS_VZ */
 
 /**
  * kvm_mips_guest_exception_base() - Find guest exception vector base address.
@@ -2524,8 +2591,15 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 	vcpu->arch.pc = vcpu->arch.io_pc;
 
 	switch (run->mmio.len) {
+	case 8:
+		*gpr = *(s64 *)run->mmio.data;
+		break;
+
 	case 4:
-		*gpr = *(s32 *) run->mmio.data;
+		if (vcpu->mmio_needed == 2)
+			*gpr = *(s32 *)run->mmio.data;
+		else
+			*gpr = *(u32 *)run->mmio.data;
 		break;
 
 	case 2:
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index c5b254c..16e1c93 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -51,12 +51,15 @@
 #define RA		31
 
 /* Some CP0 registers */
+#define C0_PWBASE	5, 5
 #define C0_HWRENA	7, 0
 #define C0_BADVADDR	8, 0
 #define C0_BADINSTR	8, 1
 #define C0_BADINSTRP	8, 2
 #define C0_ENTRYHI	10, 0
+#define C0_GUESTCTL1	10, 4
 #define C0_STATUS	12, 0
+#define C0_GUESTCTL0	12, 6
 #define C0_CAUSE	13, 0
 #define C0_EPC		14, 0
 #define C0_EBASE	15, 1
@@ -292,8 +295,8 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	unsigned int i;
 	struct uasm_label labels[2];
 	struct uasm_reloc relocs[2];
-	struct uasm_label *l = labels;
-	struct uasm_reloc *r = relocs;
+	struct uasm_label __maybe_unused *l = labels;
+	struct uasm_reloc __maybe_unused *r = relocs;
 
 	memset(labels, 0, sizeof(labels));
 	memset(relocs, 0, sizeof(relocs));
@@ -302,7 +305,67 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
 	UASM_i_MTC0(&p, T0, C0_EPC);
 
-	/* Set the ASID for the Guest Kernel */
+#ifdef CONFIG_KVM_MIPS_VZ
+	/* Save normal linux process pgd (VZ guarantees pgd_reg is set) */
+	UASM_i_MFC0(&p, K0, c0_kscratch(), pgd_reg);
+	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_pgd), K1);
+
+	/*
+	 * Set up KVM GPA pgd.
+	 * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
+	 * - call tlbmiss_handler_setup_pgd(mm->pgd)
+	 * - write mm->pgd into CP0_PWBase
+	 *
+	 * We keep S0 pointing at struct kvm so we can load the ASID below.
+	 */
+	UASM_i_LW(&p, S0, (int)offsetof(struct kvm_vcpu, kvm) -
+			  (int)offsetof(struct kvm_vcpu, arch), K1);
+	UASM_i_LW(&p, A0, offsetof(struct kvm, arch.gpa_mm.pgd), S0);
+	UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
+	uasm_i_jalr(&p, RA, T9);
+	/* delay slot */
+	if (cpu_has_htw)
+		UASM_i_MTC0(&p, A0, C0_PWBASE);
+	else
+		uasm_i_nop(&p);
+
+	/* Set GM bit to setup eret to VZ guest context */
+	uasm_i_addiu(&p, V1, ZERO, 1);
+	uasm_i_mfc0(&p, K0, C0_GUESTCTL0);
+	uasm_i_ins(&p, K0, V1, MIPS_GCTL0_GM_SHIFT, 1);
+	uasm_i_mtc0(&p, K0, C0_GUESTCTL0);
+
+	if (cpu_has_guestid) {
+		/*
+		 * Set root mode GuestID, so that root TLB refill handler can
+		 * use the correct GuestID in the root TLB.
+		 */
+
+		/* Get current GuestID */
+		uasm_i_mfc0(&p, T0, C0_GUESTCTL1);
+		/* Set GuestCtl1.RID = GuestCtl1.ID */
+		uasm_i_ext(&p, T1, T0, MIPS_GCTL1_ID_SHIFT,
+			   MIPS_GCTL1_ID_WIDTH);
+		uasm_i_ins(&p, T0, T1, MIPS_GCTL1_RID_SHIFT,
+			   MIPS_GCTL1_RID_WIDTH);
+		uasm_i_mtc0(&p, T0, C0_GUESTCTL1);
+
+		/* GuestID handles dealiasing so we don't need to touch ASID */
+		goto skip_asid_restore;
+	}
+
+	/* Root ASID Dealias (RAD) */
+
+	/* Save host ASID */
+	UASM_i_MFC0(&p, K0, C0_ENTRYHI);
+	UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi),
+		  K1);
+
+	/* Set the root ASID for the Guest */
+	UASM_i_ADDIU(&p, T1, S0,
+		     offsetof(struct kvm, arch.gpa_mm.context.asid));
+#else
+	/* Set the ASID for the Guest Kernel or User */
 	UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
 	UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
 		  T0);
@@ -315,6 +378,7 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
 					  guest_user_mm.context.asid));
 	uasm_l_kernel_asid(&l, p);
+#endif
 
 	/* t1: contains the base of the ASID array, need to get the cpu id  */
 	/* smp_processor_id */
@@ -339,6 +403,7 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
 #endif
 
+#ifndef CONFIG_KVM_MIPS_VZ
 	/*
 	 * Set up KVM T&E GVA pgd.
 	 * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
@@ -351,7 +416,11 @@ static void *kvm_mips_build_enter_guest(void *addr)
 	UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
 	uasm_i_jalr(&p, RA, T9);
 	 uasm_i_mtc0(&p, K0, C0_ENTRYHI);
-
+#else
+	/* Set up KVM VZ root ASID (!guestid) */
+	uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+skip_asid_restore:
+#endif
 	uasm_i_ehb(&p);
 
 	/* Disable RDHWR access */
@@ -559,13 +628,10 @@ void *kvm_mips_build_exit(void *addr)
 	/* Now that context has been saved, we can use other registers */
 
 	/* Restore vcpu */
-	UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
-	uasm_i_move(&p, S1, A1);
+	UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
 
 	/* Restore run (vcpu->run) */
-	UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
-	/* Save pointer to run in s0, will be saved by the compiler */
-	uasm_i_move(&p, S0, A0);
+	UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1);
 
 	/*
 	 * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
@@ -641,6 +707,52 @@ void *kvm_mips_build_exit(void *addr)
 		uasm_l_msa_1(&l, p);
 	}
 
+#ifdef CONFIG_KVM_MIPS_VZ
+	/* Restore host ASID */
+	if (!cpu_has_guestid) {
+		UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi),
+			  K1);
+		UASM_i_MTC0(&p, K0, C0_ENTRYHI);
+	}
+
+	/*
+	 * Set up normal Linux process pgd.
+	 * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
+	 * - call tlbmiss_handler_setup_pgd(mm->pgd)
+	 * - write mm->pgd into CP0_PWBase
+	 */
+	UASM_i_LW(&p, A0,
+		  offsetof(struct kvm_vcpu_arch, host_pgd), K1);
+	UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
+	uasm_i_jalr(&p, RA, T9);
+	/* delay slot */
+	if (cpu_has_htw)
+		UASM_i_MTC0(&p, A0, C0_PWBASE);
+	else
+		uasm_i_nop(&p);
+
+	/* Clear GM bit so we don't enter guest mode when EXL is cleared */
+	uasm_i_mfc0(&p, K0, C0_GUESTCTL0);
+	uasm_i_ins(&p, K0, ZERO, MIPS_GCTL0_GM_SHIFT, 1);
+	uasm_i_mtc0(&p, K0, C0_GUESTCTL0);
+
+	/* Save GuestCtl0 so we can access GExcCode after CPU migration */
+	uasm_i_sw(&p, K0,
+		  offsetof(struct kvm_vcpu_arch, host_cp0_guestctl0), K1);
+
+	if (cpu_has_guestid) {
+		/*
+		 * Clear root mode GuestID, so that root TLB operations use the
+		 * root GuestID in the root TLB.
+		 */
+		uasm_i_mfc0(&p, T0, C0_GUESTCTL1);
+		/* Set GuestCtl1.RID = MIPS_GCTL1_ROOT_GUESTID (i.e. 0) */
+		uasm_i_ins(&p, T0, ZERO, MIPS_GCTL1_RID_SHIFT,
+			   MIPS_GCTL1_RID_WIDTH);
+		uasm_i_mtc0(&p, T0, C0_GUESTCTL1);
+	}
+#endif
+
 	/* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
 	uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
 	uasm_i_and(&p, V0, V0, AT);
@@ -680,6 +792,8 @@ void *kvm_mips_build_exit(void *addr)
 	 * Now jump to the kvm_mips_handle_exit() to see if we can deal
 	 * with this in the kernel
 	 */
+	uasm_i_move(&p, A0, S0);
+	uasm_i_move(&p, A1, S1);
 	UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
 	uasm_i_jalr(&p, RA, T9);
 	 UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
diff --git a/arch/mips/kvm/hypcall.c b/arch/mips/kvm/hypcall.c
new file mode 100644
index 0000000..8306343
--- /dev/null
+++ b/arch/mips/kvm/hypcall.c
@@ -0,0 +1,53 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: Hypercall handling.
+ *
+ * Copyright (C) 2015  Imagination Technologies Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_para.h>
+
+#define MAX_HYPCALL_ARGS	4
+
+enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu,
+					    union mips_instruction inst)
+{
+	unsigned int code = (inst.co_format.code >> 5) & 0x3ff;
+
+	kvm_debug("[%#lx] HYPCALL %#03x\n", vcpu->arch.pc, code);
+
+	switch (code) {
+	case 0:
+		return EMULATE_HYPERCALL;
+	default:
+		return EMULATE_FAIL;
+	};
+}
+
+static int kvm_mips_hypercall(struct kvm_vcpu *vcpu, unsigned long num,
+			      const unsigned long *args, unsigned long *hret)
+{
+	/* Report unimplemented hypercall to guest */
+	*hret = -KVM_ENOSYS;
+	return RESUME_GUEST;
+}
+
+int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu)
+{
+	unsigned long num, args[MAX_HYPCALL_ARGS];
+
+	/* read hypcall number and arguments */
+	num = vcpu->arch.gprs[2];	/* v0 */
+	args[0] = vcpu->arch.gprs[4];	/* a0 */
+	args[1] = vcpu->arch.gprs[5];	/* a1 */
+	args[2] = vcpu->arch.gprs[6];	/* a2 */
+	args[3] = vcpu->arch.gprs[7];	/* a3 */
+
+	return kvm_mips_hypercall(vcpu, num,
+				  args, &vcpu->arch.gprs[2] /* v0 */);
+}
diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h
index fb118a2..3bf0a49 100644
--- a/arch/mips/kvm/interrupt.h
+++ b/arch/mips/kvm/interrupt.h
@@ -30,8 +30,13 @@
 
 #define C_TI        (_ULCAST_(1) << 30)
 
+#ifdef CONFIG_KVM_MIPS_VZ
+#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (1)
+#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (1)
+#else
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
 #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (0)
+#endif
 
 void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
 void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 15a1b17..d4b2ad1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -59,6 +59,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "fpe",	  VCPU_STAT(fpe_exits),		 KVM_STAT_VCPU },
 	{ "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU },
 	{ "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
+#ifdef CONFIG_KVM_MIPS_VZ
+	{ "vz_gpsi",	  VCPU_STAT(vz_gpsi_exits),	 KVM_STAT_VCPU },
+	{ "vz_gsfc",	  VCPU_STAT(vz_gsfc_exits),	 KVM_STAT_VCPU },
+	{ "vz_hc",	  VCPU_STAT(vz_hc_exits),	 KVM_STAT_VCPU },
+	{ "vz_grr",	  VCPU_STAT(vz_grr_exits),	 KVM_STAT_VCPU },
+	{ "vz_gva",	  VCPU_STAT(vz_gva_exits),	 KVM_STAT_VCPU },
+	{ "vz_ghfc",	  VCPU_STAT(vz_ghfc_exits),	 KVM_STAT_VCPU },
+	{ "vz_gpa",	  VCPU_STAT(vz_gpa_exits),	 KVM_STAT_VCPU },
+	{ "vz_resvd",	  VCPU_STAT(vz_resvd_exits),	 KVM_STAT_VCPU },
+#endif
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU },
 	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU },
@@ -66,6 +76,19 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{NULL}
 };
 
+bool kvm_trace_guest_mode_change;
+
+int kvm_guest_mode_change_trace_reg(void)
+{
+	kvm_trace_guest_mode_change = 1;
+	return 0;
+}
+
+void kvm_guest_mode_change_trace_unreg(void)
+{
+	kvm_trace_guest_mode_change = 0;
+}
+
 /*
  * XXXKYMA: We are simulatoring a processor that has the WII bit set in
  * Config7, so we are "runnable" if interrupts are pending
@@ -82,7 +105,12 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_hardware_enable(void)
 {
-	return 0;
+	return kvm_mips_callbacks->hardware_enable();
+}
+
+void kvm_arch_hardware_disable(void)
+{
+	kvm_mips_callbacks->hardware_disable();
 }
 
 int kvm_arch_hardware_setup(void)
@@ -97,6 +125,18 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+	switch (type) {
+#ifdef CONFIG_KVM_MIPS_VZ
+	case KVM_VM_MIPS_VZ:
+#else
+	case KVM_VM_MIPS_TE:
+#endif
+		break;
+	default:
+		/* Unsupported KVM type */
+		return -EINVAL;
+	};
+
 	/* Allocate page table to map GPA -> RPA */
 	kvm->arch.gpa_mm.pgd = kvm_pgd_alloc();
 	if (!kvm->arch.gpa_mm.pgd)
@@ -301,8 +341,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	/* Build guest exception vectors dynamically in unmapped memory */
 	handler = gebase + 0x2000;
 
-	/* TLB refill */
+	/* TLB refill (or XTLB refill on 64-bit VZ where KX=1) */
 	refill_start = gebase;
+	if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && IS_ENABLED(CONFIG_64BIT))
+		refill_start += 0x080;
 	refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler);
 
 	/* General Exception Entry point */
@@ -353,9 +395,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 
 	/* Init */
 	vcpu->arch.last_sched_cpu = -1;
-
-	/* Start off the timer */
-	kvm_mips_init_count(vcpu);
+	vcpu->arch.last_exec_cpu = -1;
 
 	return vcpu;
 
@@ -1030,9 +1070,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_IMMEDIATE_EXIT:
 		r = 1;
 		break;
-	case KVM_CAP_COALESCED_MMIO:
-		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-		break;
 	case KVM_CAP_NR_VCPUS:
 		r = num_online_cpus();
 		break;
@@ -1059,7 +1096,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = cpu_has_msa && !(boot_cpu_data.msa_id & MSA_IR_WRPF);
 		break;
 	default:
-		r = 0;
+		r = kvm_mips_callbacks->check_extension(kvm, ext);
 		break;
 	}
 	return r;
@@ -1067,7 +1104,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-	return kvm_mips_pending_timer(vcpu);
+	return kvm_mips_pending_timer(vcpu) ||
+		kvm_read_c0_guest_cause(vcpu->arch.cop0) & C_TI;
 }
 
 int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu)
@@ -1092,7 +1130,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu)
 	kvm_debug("\tlo: 0x%08lx\n", vcpu->arch.lo);
 
 	cop0 = vcpu->arch.cop0;
-	kvm_debug("\tStatus: 0x%08lx, Cause: 0x%08lx\n",
+	kvm_debug("\tStatus: 0x%08x, Cause: 0x%08x\n",
 		  kvm_read_c0_guest_status(cop0),
 		  kvm_read_c0_guest_cause(cop0));
 
@@ -1208,7 +1246,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 
 	/* re-enable HTW before enabling interrupts */
-	htw_start();
+	if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ))
+		htw_start();
 
 	/* Set a default exit reason */
 	run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -1226,17 +1265,20 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			cause, opc, run, vcpu);
 	trace_kvm_exit(vcpu, exccode);
 
-	/*
-	 * Do a privilege check, if in UM most of these exit conditions end up
-	 * causing an exception to be delivered to the Guest Kernel
-	 */
-	er = kvm_mips_check_privilege(cause, opc, run, vcpu);
-	if (er == EMULATE_PRIV_FAIL) {
-		goto skip_emul;
-	} else if (er == EMULATE_FAIL) {
-		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		ret = RESUME_HOST;
-		goto skip_emul;
+	if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+		/*
+		 * Do a privilege check, if in UM most of these exit conditions
+		 * end up causing an exception to be delivered to the Guest
+		 * Kernel
+		 */
+		er = kvm_mips_check_privilege(cause, opc, run, vcpu);
+		if (er == EMULATE_PRIV_FAIL) {
+			goto skip_emul;
+		} else if (er == EMULATE_FAIL) {
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			ret = RESUME_HOST;
+			goto skip_emul;
+		}
 	}
 
 	switch (exccode) {
@@ -1267,7 +1309,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 
 	case EXCCODE_TLBS:
-		kvm_debug("TLB ST fault:  cause %#x, status %#lx, PC: %p, BadVaddr: %#lx\n",
+		kvm_debug("TLB ST fault:  cause %#x, status %#x, PC: %p, BadVaddr: %#lx\n",
 			  cause, kvm_read_c0_guest_status(vcpu->arch.cop0), opc,
 			  badvaddr);
 
@@ -1328,12 +1370,17 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
 		break;
 
+	case EXCCODE_GE:
+		/* defer exit accounting to handler */
+		ret = kvm_mips_callbacks->handle_guest_exit(vcpu);
+		break;
+
 	default:
 		if (cause & CAUSEF_BD)
 			opc += 1;
 		inst = 0;
 		kvm_get_badinstr(opc, vcpu, &inst);
-		kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x  BadVaddr: %#lx Status: %#lx\n",
+		kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x  BadVaddr: %#lx Status: %#x\n",
 			exccode, opc, inst, badvaddr,
 			kvm_read_c0_guest_status(vcpu->arch.cop0));
 		kvm_arch_vcpu_dump_regs(vcpu);
@@ -1346,6 +1393,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 skip_emul:
 	local_irq_disable();
 
+	if (ret == RESUME_GUEST)
+		kvm_vz_acquire_htimer(vcpu);
+
 	if (er == EMULATE_DONE && !(ret & RESUME_HOST))
 		kvm_mips_deliver_interrupts(vcpu, cause);
 
@@ -1391,7 +1441,8 @@ skip_emul:
 	}
 
 	/* Disable HTW before returning to guest or host */
-	htw_stop();
+	if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ))
+		htw_stop();
 
 	return ret;
 }
@@ -1527,16 +1578,18 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu)
 void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 {
 	/*
-	 * FPU & MSA get disabled in root context (hardware) when it is disabled
-	 * in guest context (software), but the register state in the hardware
-	 * may still be in use. This is why we explicitly re-enable the hardware
-	 * before saving.
+	 * With T&E, FPU & MSA get disabled in root context (hardware) when it
+	 * is disabled in guest context (software), but the register state in
+	 * the hardware may still be in use.
+	 * This is why we explicitly re-enable the hardware before saving.
 	 */
 
 	preempt_disable();
 	if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
-		set_c0_config5(MIPS_CONF5_MSAEN);
-		enable_fpu_hazard();
+		if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+			set_c0_config5(MIPS_CONF5_MSAEN);
+			enable_fpu_hazard();
+		}
 
 		__kvm_save_msa(&vcpu->arch);
 		trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
@@ -1549,8 +1602,10 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 		}
 		vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
 	} else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
-		set_c0_status(ST0_CU1);
-		enable_fpu_hazard();
+		if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+			set_c0_status(ST0_CU1);
+			enable_fpu_hazard();
+		}
 
 		__kvm_save_fpu(&vcpu->arch);
 		vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index cb0faad..ee64db0 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -992,6 +992,22 @@ static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo)
 	return kvm_mips_gpa_pte_to_gva_unmapped(pte);
 }
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr,
+				      struct kvm_vcpu *vcpu,
+				      bool write_fault)
+{
+	int ret;
+
+	ret = kvm_mips_map_page(vcpu, badvaddr, write_fault, NULL, NULL);
+	if (ret)
+		return ret;
+
+	/* Invalidate this entry in the TLB */
+	return kvm_vz_host_tlb_inv(vcpu, badvaddr);
+}
+#endif
+
 /* XXXKYMA: Must be called with interrupts disabled */
 int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 				    struct kvm_vcpu *vcpu,
@@ -1225,6 +1241,10 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
 {
 	int err;
 
+	if (WARN(IS_ENABLED(CONFIG_KVM_MIPS_VZ),
+		 "Expect BadInstr/BadInstrP registers to be used with VZ\n"))
+		return -EINVAL;
+
 retry:
 	kvm_trap_emul_gva_lockless_begin(vcpu);
 	err = get_user(*out, opc);
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 2819eb7..7c6336d 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -33,6 +33,25 @@
 #define KVM_GUEST_PC_TLB    0
 #define KVM_GUEST_SP_TLB    1
 
+#ifdef CONFIG_KVM_MIPS_VZ
+unsigned long GUESTID_MASK;
+EXPORT_SYMBOL_GPL(GUESTID_MASK);
+unsigned long GUESTID_FIRST_VERSION;
+EXPORT_SYMBOL_GPL(GUESTID_FIRST_VERSION);
+unsigned long GUESTID_VERSION_MASK;
+EXPORT_SYMBOL_GPL(GUESTID_VERSION_MASK);
+
+static u32 kvm_mips_get_root_asid(struct kvm_vcpu *vcpu)
+{
+	struct mm_struct *gpa_mm = &vcpu->kvm->arch.gpa_mm;
+
+	if (cpu_has_guestid)
+		return 0;
+	else
+		return cpu_asid(smp_processor_id(), gpa_mm);
+}
+#endif
+
 static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
 	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
@@ -166,6 +185,13 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
 
 	local_irq_restore(flags);
 
+	/*
+	 * We don't want to get reserved instruction exceptions for missing tlb
+	 * entries.
+	 */
+	if (cpu_has_vtag_icache)
+		flush_icache_all();
+
 	if (user && idx_user >= 0)
 		kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n",
 			  __func__, (va & VPN2_MASK) |
@@ -179,6 +205,421 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
 }
 EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+
+/* GuestID management */
+
+/**
+ * clear_root_gid() - Set GuestCtl1.RID for normal root operation.
+ */
+static inline void clear_root_gid(void)
+{
+	if (cpu_has_guestid) {
+		clear_c0_guestctl1(MIPS_GCTL1_RID);
+		mtc0_tlbw_hazard();
+	}
+}
+
+/**
+ * set_root_gid_to_guest_gid() - Set GuestCtl1.RID to match GuestCtl1.ID.
+ *
+ * Sets the root GuestID to match the current guest GuestID, for TLB operation
+ * on the GPA->RPA mappings in the root TLB.
+ *
+ * The caller must be sure to disable HTW while the root GID is set, and
+ * possibly longer if TLB registers are modified.
+ */
+static inline void set_root_gid_to_guest_gid(void)
+{
+	unsigned int guestctl1;
+
+	if (cpu_has_guestid) {
+		back_to_back_c0_hazard();
+		guestctl1 = read_c0_guestctl1();
+		guestctl1 = (guestctl1 & ~MIPS_GCTL1_RID) |
+			((guestctl1 & MIPS_GCTL1_ID) >> MIPS_GCTL1_ID_SHIFT)
+						     << MIPS_GCTL1_RID_SHIFT;
+		write_c0_guestctl1(guestctl1);
+		mtc0_tlbw_hazard();
+	}
+}
+
+int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
+{
+	int idx;
+	unsigned long flags, old_entryhi;
+
+	local_irq_save(flags);
+	htw_stop();
+
+	/* Set root GuestID for root probe and write of guest TLB entry */
+	set_root_gid_to_guest_gid();
+
+	old_entryhi = read_c0_entryhi();
+
+	idx = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
+				     kvm_mips_get_root_asid(vcpu));
+
+	write_c0_entryhi(old_entryhi);
+	clear_root_gid();
+	mtc0_tlbw_hazard();
+
+	htw_start();
+	local_irq_restore(flags);
+
+	/*
+	 * We don't want to get reserved instruction exceptions for missing tlb
+	 * entries.
+	 */
+	if (cpu_has_vtag_icache)
+		flush_icache_all();
+
+	if (idx > 0)
+		kvm_debug("%s: Invalidated root entryhi %#lx @ idx %d\n",
+			  __func__, (va & VPN2_MASK) |
+				    kvm_mips_get_root_asid(vcpu), idx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vz_host_tlb_inv);
+
+/**
+ * kvm_vz_guest_tlb_lookup() - Lookup a guest VZ TLB mapping.
+ * @vcpu:	KVM VCPU pointer.
+ * @gpa:	Guest virtual address in a TLB mapped guest segment.
+ * @gpa:	Ponter to output guest physical address it maps to.
+ *
+ * Converts a guest virtual address in a guest TLB mapped segment to a guest
+ * physical address, by probing the guest TLB.
+ *
+ * Returns:	0 if guest TLB mapping exists for @gva. *@gpa will have been
+ *		written.
+ *		-EFAULT if no guest TLB mapping exists for @gva. *@gpa may not
+ *		have been written.
+ */
+int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva,
+			    unsigned long *gpa)
+{
+	unsigned long o_entryhi, o_entrylo[2], o_pagemask;
+	unsigned int o_index;
+	unsigned long entrylo[2], pagemask, pagemaskbit, pa;
+	unsigned long flags;
+	int index;
+
+	/* Probe the guest TLB for a mapping */
+	local_irq_save(flags);
+	/* Set root GuestID for root probe of guest TLB entry */
+	htw_stop();
+	set_root_gid_to_guest_gid();
+
+	o_entryhi = read_gc0_entryhi();
+	o_index = read_gc0_index();
+
+	write_gc0_entryhi((o_entryhi & 0x3ff) | (gva & ~0xfffl));
+	mtc0_tlbw_hazard();
+	guest_tlb_probe();
+	tlb_probe_hazard();
+
+	index = read_gc0_index();
+	if (index < 0) {
+		/* No match, fail */
+		write_gc0_entryhi(o_entryhi);
+		write_gc0_index(o_index);
+
+		clear_root_gid();
+		htw_start();
+		local_irq_restore(flags);
+		return -EFAULT;
+	}
+
+	/* Match! read the TLB entry */
+	o_entrylo[0] = read_gc0_entrylo0();
+	o_entrylo[1] = read_gc0_entrylo1();
+	o_pagemask = read_gc0_pagemask();
+
+	mtc0_tlbr_hazard();
+	guest_tlb_read();
+	tlb_read_hazard();
+
+	entrylo[0] = read_gc0_entrylo0();
+	entrylo[1] = read_gc0_entrylo1();
+	pagemask = ~read_gc0_pagemask() & ~0x1fffl;
+
+	write_gc0_entryhi(o_entryhi);
+	write_gc0_index(o_index);
+	write_gc0_entrylo0(o_entrylo[0]);
+	write_gc0_entrylo1(o_entrylo[1]);
+	write_gc0_pagemask(o_pagemask);
+
+	clear_root_gid();
+	htw_start();
+	local_irq_restore(flags);
+
+	/* Select one of the EntryLo values and interpret the GPA */
+	pagemaskbit = (pagemask ^ (pagemask & (pagemask - 1))) >> 1;
+	pa = entrylo[!!(gva & pagemaskbit)];
+
+	/*
+	 * TLB entry may have become invalid since TLB probe if physical FTLB
+	 * entries are shared between threads (e.g. I6400).
+	 */
+	if (!(pa & ENTRYLO_V))
+		return -EFAULT;
+
+	/*
+	 * Note, this doesn't take guest MIPS32 XPA into account, where PFN is
+	 * split with XI/RI in the middle.
+	 */
+	pa = (pa << 6) & ~0xfffl;
+	pa |= gva & ~(pagemask | pagemaskbit);
+
+	*gpa = pa;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vz_guest_tlb_lookup);
+
+/**
+ * kvm_vz_local_flush_roottlb_all_guests() - Flush all root TLB entries for
+ * guests.
+ *
+ * Invalidate all entries in root tlb which are GPA mappings.
+ */
+void kvm_vz_local_flush_roottlb_all_guests(void)
+{
+	unsigned long flags;
+	unsigned long old_entryhi, old_pagemask, old_guestctl1;
+	int entry;
+
+	if (WARN_ON(!cpu_has_guestid))
+		return;
+
+	local_irq_save(flags);
+	htw_stop();
+
+	/* TLBR may clobber EntryHi.ASID, PageMask, and GuestCtl1.RID */
+	old_entryhi = read_c0_entryhi();
+	old_pagemask = read_c0_pagemask();
+	old_guestctl1 = read_c0_guestctl1();
+
+	/*
+	 * Invalidate guest entries in root TLB while leaving root entries
+	 * intact when possible.
+	 */
+	for (entry = 0; entry < current_cpu_data.tlbsize; entry++) {
+		write_c0_index(entry);
+		mtc0_tlbw_hazard();
+		tlb_read();
+		tlb_read_hazard();
+
+		/* Don't invalidate non-guest (RVA) mappings in the root TLB */
+		if (!(read_c0_guestctl1() & MIPS_GCTL1_RID))
+			continue;
+
+		/* Make sure all entries differ. */
+		write_c0_entryhi(UNIQUE_ENTRYHI(entry));
+		write_c0_entrylo0(0);
+		write_c0_entrylo1(0);
+		write_c0_guestctl1(0);
+		mtc0_tlbw_hazard();
+		tlb_write_indexed();
+	}
+
+	write_c0_entryhi(old_entryhi);
+	write_c0_pagemask(old_pagemask);
+	write_c0_guestctl1(old_guestctl1);
+	tlbw_use_hazard();
+
+	htw_start();
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(kvm_vz_local_flush_roottlb_all_guests);
+
+/**
+ * kvm_vz_local_flush_guesttlb_all() - Flush all guest TLB entries.
+ *
+ * Invalidate all entries in guest tlb irrespective of guestid.
+ */
+void kvm_vz_local_flush_guesttlb_all(void)
+{
+	unsigned long flags;
+	unsigned long old_index;
+	unsigned long old_entryhi;
+	unsigned long old_entrylo[2];
+	unsigned long old_pagemask;
+	int entry;
+	u64 cvmmemctl2 = 0;
+
+	local_irq_save(flags);
+
+	/* Preserve all clobbered guest registers */
+	old_index = read_gc0_index();
+	old_entryhi = read_gc0_entryhi();
+	old_entrylo[0] = read_gc0_entrylo0();
+	old_entrylo[1] = read_gc0_entrylo1();
+	old_pagemask = read_gc0_pagemask();
+
+	switch (current_cpu_type()) {
+	case CPU_CAVIUM_OCTEON3:
+		/* Inhibit machine check due to multiple matching TLB entries */
+		cvmmemctl2 = read_c0_cvmmemctl2();
+		cvmmemctl2 |= CVMMEMCTL2_INHIBITTS;
+		write_c0_cvmmemctl2(cvmmemctl2);
+		break;
+	};
+
+	/* Invalidate guest entries in guest TLB */
+	write_gc0_entrylo0(0);
+	write_gc0_entrylo1(0);
+	write_gc0_pagemask(0);
+	for (entry = 0; entry < current_cpu_data.guest.tlbsize; entry++) {
+		/* Make sure all entries differ. */
+		write_gc0_index(entry);
+		write_gc0_entryhi(UNIQUE_GUEST_ENTRYHI(entry));
+		mtc0_tlbw_hazard();
+		guest_tlb_write_indexed();
+	}
+
+	if (cvmmemctl2) {
+		cvmmemctl2 &= ~CVMMEMCTL2_INHIBITTS;
+		write_c0_cvmmemctl2(cvmmemctl2);
+	};
+
+	write_gc0_index(old_index);
+	write_gc0_entryhi(old_entryhi);
+	write_gc0_entrylo0(old_entrylo[0]);
+	write_gc0_entrylo1(old_entrylo[1]);
+	write_gc0_pagemask(old_pagemask);
+	tlbw_use_hazard();
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(kvm_vz_local_flush_guesttlb_all);
+
+/**
+ * kvm_vz_save_guesttlb() - Save a range of guest TLB entries.
+ * @buf:	Buffer to write TLB entries into.
+ * @index:	Start index.
+ * @count:	Number of entries to save.
+ *
+ * Save a range of guest TLB entries. The caller must ensure interrupts are
+ * disabled.
+ */
+void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index,
+			  unsigned int count)
+{
+	unsigned int end = index + count;
+	unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask;
+	unsigned int guestctl1 = 0;
+	int old_index, i;
+
+	/* Save registers we're about to clobber */
+	old_index = read_gc0_index();
+	old_entryhi = read_gc0_entryhi();
+	old_entrylo0 = read_gc0_entrylo0();
+	old_entrylo1 = read_gc0_entrylo1();
+	old_pagemask = read_gc0_pagemask();
+
+	/* Set root GuestID for root probe */
+	htw_stop();
+	set_root_gid_to_guest_gid();
+	if (cpu_has_guestid)
+		guestctl1 = read_c0_guestctl1();
+
+	/* Read each entry from guest TLB */
+	for (i = index; i < end; ++i, ++buf) {
+		write_gc0_index(i);
+
+		mtc0_tlbr_hazard();
+		guest_tlb_read();
+		tlb_read_hazard();
+
+		if (cpu_has_guestid &&
+		    (read_c0_guestctl1() ^ guestctl1) & MIPS_GCTL1_RID) {
+			/* Entry invalid or belongs to another guest */
+			buf->tlb_hi = UNIQUE_GUEST_ENTRYHI(i);
+			buf->tlb_lo[0] = 0;
+			buf->tlb_lo[1] = 0;
+			buf->tlb_mask = 0;
+		} else {
+			/* Entry belongs to the right guest */
+			buf->tlb_hi = read_gc0_entryhi();
+			buf->tlb_lo[0] = read_gc0_entrylo0();
+			buf->tlb_lo[1] = read_gc0_entrylo1();
+			buf->tlb_mask = read_gc0_pagemask();
+		}
+	}
+
+	/* Clear root GuestID again */
+	clear_root_gid();
+	htw_start();
+
+	/* Restore clobbered registers */
+	write_gc0_index(old_index);
+	write_gc0_entryhi(old_entryhi);
+	write_gc0_entrylo0(old_entrylo0);
+	write_gc0_entrylo1(old_entrylo1);
+	write_gc0_pagemask(old_pagemask);
+
+	tlbw_use_hazard();
+}
+EXPORT_SYMBOL_GPL(kvm_vz_save_guesttlb);
+
+/**
+ * kvm_vz_load_guesttlb() - Save a range of guest TLB entries.
+ * @buf:	Buffer to read TLB entries from.
+ * @index:	Start index.
+ * @count:	Number of entries to load.
+ *
+ * Load a range of guest TLB entries. The caller must ensure interrupts are
+ * disabled.
+ */
+void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index,
+			  unsigned int count)
+{
+	unsigned int end = index + count;
+	unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask;
+	int old_index, i;
+
+	/* Save registers we're about to clobber */
+	old_index = read_gc0_index();
+	old_entryhi = read_gc0_entryhi();
+	old_entrylo0 = read_gc0_entrylo0();
+	old_entrylo1 = read_gc0_entrylo1();
+	old_pagemask = read_gc0_pagemask();
+
+	/* Set root GuestID for root probe */
+	htw_stop();
+	set_root_gid_to_guest_gid();
+
+	/* Write each entry to guest TLB */
+	for (i = index; i < end; ++i, ++buf) {
+		write_gc0_index(i);
+		write_gc0_entryhi(buf->tlb_hi);
+		write_gc0_entrylo0(buf->tlb_lo[0]);
+		write_gc0_entrylo1(buf->tlb_lo[1]);
+		write_gc0_pagemask(buf->tlb_mask);
+
+		mtc0_tlbw_hazard();
+		guest_tlb_write_indexed();
+	}
+
+	/* Clear root GuestID again */
+	clear_root_gid();
+	htw_start();
+
+	/* Restore clobbered registers */
+	write_gc0_index(old_index);
+	write_gc0_entryhi(old_entryhi);
+	write_gc0_entrylo0(old_entrylo0);
+	write_gc0_entrylo1(old_entrylo1);
+	write_gc0_pagemask(old_pagemask);
+
+	tlbw_use_hazard();
+}
+EXPORT_SYMBOL_GPL(kvm_vz_load_guesttlb);
+
+#endif
+
 /**
  * kvm_mips_suspend_mm() - Suspend the active mm.
  * @cpu		The CPU we're running on.
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index c858cf1..a8c7fd7 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -18,6 +18,13 @@
 #define TRACE_INCLUDE_FILE trace
 
 /*
+ * arch/mips/kvm/mips.c
+ */
+extern bool kvm_trace_guest_mode_change;
+int kvm_guest_mode_change_trace_reg(void);
+void kvm_guest_mode_change_trace_unreg(void);
+
+/*
  * Tracepoints for VM enters
  */
 DECLARE_EVENT_CLASS(kvm_transition,
@@ -62,10 +69,20 @@ DEFINE_EVENT(kvm_transition, kvm_out,
 #define KVM_TRACE_EXIT_MSA_FPE		14
 #define KVM_TRACE_EXIT_FPE		15
 #define KVM_TRACE_EXIT_MSA_DISABLED	21
+#define KVM_TRACE_EXIT_GUEST_EXIT	27
 /* Further exit reasons */
 #define KVM_TRACE_EXIT_WAIT		32
 #define KVM_TRACE_EXIT_CACHE		33
 #define KVM_TRACE_EXIT_SIGNAL		34
+/* 32 exit reasons correspond to GuestCtl0.GExcCode (VZ) */
+#define KVM_TRACE_EXIT_GEXCCODE_BASE	64
+#define KVM_TRACE_EXIT_GPSI		64	/*  0 */
+#define KVM_TRACE_EXIT_GSFC		65	/*  1 */
+#define KVM_TRACE_EXIT_HC		66	/*  2 */
+#define KVM_TRACE_EXIT_GRR		67	/*  3 */
+#define KVM_TRACE_EXIT_GVA		72	/*  8 */
+#define KVM_TRACE_EXIT_GHFC		73	/*  9 */
+#define KVM_TRACE_EXIT_GPA		74	/* 10 */
 
 /* Tracepoints for VM exits */
 #define kvm_trace_symbol_exit_types				\
@@ -83,9 +100,17 @@ DEFINE_EVENT(kvm_transition, kvm_out,
 	{ KVM_TRACE_EXIT_MSA_FPE,	"MSA FPE" },		\
 	{ KVM_TRACE_EXIT_FPE,		"FPE" },		\
 	{ KVM_TRACE_EXIT_MSA_DISABLED,	"MSA Disabled" },	\
+	{ KVM_TRACE_EXIT_GUEST_EXIT,	"Guest Exit" },		\
 	{ KVM_TRACE_EXIT_WAIT,		"WAIT" },		\
 	{ KVM_TRACE_EXIT_CACHE,		"CACHE" },		\
-	{ KVM_TRACE_EXIT_SIGNAL,	"Signal" }
+	{ KVM_TRACE_EXIT_SIGNAL,	"Signal" },		\
+	{ KVM_TRACE_EXIT_GPSI,		"GPSI" },		\
+	{ KVM_TRACE_EXIT_GSFC,		"GSFC" },		\
+	{ KVM_TRACE_EXIT_HC,		"HC" },			\
+	{ KVM_TRACE_EXIT_GRR,		"GRR" },		\
+	{ KVM_TRACE_EXIT_GVA,		"GVA" },		\
+	{ KVM_TRACE_EXIT_GHFC,		"GHFC" },		\
+	{ KVM_TRACE_EXIT_GPA,		"GPA" }
 
 TRACE_EVENT(kvm_exit,
 	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -158,6 +183,8 @@ TRACE_EVENT(kvm_exit,
 	{ KVM_TRACE_COP0(16, 4),	"Config4" },		\
 	{ KVM_TRACE_COP0(16, 5),	"Config5" },		\
 	{ KVM_TRACE_COP0(16, 7),	"Config7" },		\
+	{ KVM_TRACE_COP0(17, 1),	"MAAR" },		\
+	{ KVM_TRACE_COP0(17, 2),	"MAARI" },		\
 	{ KVM_TRACE_COP0(26, 0),	"ECC" },		\
 	{ KVM_TRACE_COP0(30, 0),	"ErrorEPC" },		\
 	{ KVM_TRACE_COP0(31, 2),	"KScratch1" },		\
@@ -268,6 +295,51 @@ TRACE_EVENT(kvm_asid_change,
 		      __entry->new_asid)
 );
 
+TRACE_EVENT(kvm_guestid_change,
+	    TP_PROTO(struct kvm_vcpu *vcpu, unsigned int guestid),
+	    TP_ARGS(vcpu, guestid),
+	    TP_STRUCT__entry(
+			__field(unsigned int, guestid)
+	    ),
+
+	    TP_fast_assign(
+			__entry->guestid = guestid;
+	    ),
+
+	    TP_printk("GuestID: 0x%02x",
+		      __entry->guestid)
+);
+
+TRACE_EVENT_FN(kvm_guest_mode_change,
+	    TP_PROTO(struct kvm_vcpu *vcpu),
+	    TP_ARGS(vcpu),
+	    TP_STRUCT__entry(
+			__field(unsigned long, epc)
+			__field(unsigned long, pc)
+			__field(unsigned long, badvaddr)
+			__field(unsigned int, status)
+			__field(unsigned int, cause)
+	    ),
+
+	    TP_fast_assign(
+			__entry->epc = kvm_read_c0_guest_epc(vcpu->arch.cop0);
+			__entry->pc = vcpu->arch.pc;
+			__entry->badvaddr = kvm_read_c0_guest_badvaddr(vcpu->arch.cop0);
+			__entry->status = kvm_read_c0_guest_status(vcpu->arch.cop0);
+			__entry->cause = kvm_read_c0_guest_cause(vcpu->arch.cop0);
+	    ),
+
+	    TP_printk("EPC: 0x%08lx PC: 0x%08lx Status: 0x%08x Cause: 0x%08x BadVAddr: 0x%08lx",
+		      __entry->epc,
+		      __entry->pc,
+		      __entry->status,
+		      __entry->cause,
+		      __entry->badvaddr),
+
+	    kvm_guest_mode_change_trace_reg,
+	    kvm_guest_mode_change_trace_unreg
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index b1fa53b..a563759 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/log2.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <asm/mmu_context.h>
@@ -40,6 +41,29 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
 	return gpa;
 }
 
+static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu)
+{
+	u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE;
+	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
+	u32 inst = 0;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	kvm_get_badinstr(opc, vcpu, &inst);
+
+	kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n",
+		exccode, opc, inst, badvaddr,
+		kvm_read_c0_guest_status(vcpu->arch.cop0));
+	kvm_arch_vcpu_dump_regs(vcpu);
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	return RESUME_HOST;
+}
+
 static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -82,6 +106,10 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 		ret = RESUME_HOST;
 		break;
 
+	case EMULATE_HYPERCALL:
+		ret = kvm_mips_handle_hypcall(vcpu);
+		break;
+
 	default:
 		BUG();
 	}
@@ -484,6 +512,31 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+static int kvm_trap_emul_hardware_enable(void)
+{
+	return 0;
+}
+
+static void kvm_trap_emul_hardware_disable(void)
+{
+}
+
+static int kvm_trap_emul_check_extension(struct kvm *kvm, long ext)
+{
+	int r;
+
+	switch (ext) {
+	case KVM_CAP_MIPS_TE:
+		r = 1;
+		break;
+	default:
+		r = 0;
+		break;
+	}
+
+	return r;
+}
+
 static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
@@ -561,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	u32 config, config1;
 	int vcpu_id = vcpu->vcpu_id;
 
+	/* Start off the timer at 100 MHz */
+	kvm_mips_init_count(vcpu, 100*1000*1000);
+
 	/*
 	 * Arch specific stuff, set up config registers properly so that the
 	 * guest will come up as expected
@@ -589,6 +645,13 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* Read the cache characteristics from the host Config1 Register */
 	config1 = (read_c0_config1() & ~0x7f);
 
+	/* DCache line size not correctly reported in Config1 on Octeon CPUs */
+	if (cpu_dcache_line_size()) {
+		config1 &= ~MIPS_CONF1_DL;
+		config1 |= ((ilog2(cpu_dcache_line_size()) - 1) <<
+			    MIPS_CONF1_DL_SHF) & MIPS_CONF1_DL;
+	}
+
 	/* Set up MMU size */
 	config1 &= ~(0x3f << 25);
 	config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
@@ -892,10 +955,12 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
 			if (v & CAUSEF_DC) {
 				/* disable timer first */
 				kvm_mips_count_disable_cause(vcpu);
-				kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v);
+				kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC,
+							  v);
 			} else {
 				/* enable timer last */
-				kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v);
+				kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC,
+							  v);
 				kvm_mips_count_enable_cause(vcpu);
 			}
 		} else {
@@ -1230,7 +1295,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
 	.handle_msa_fpe = kvm_trap_emul_handle_msa_fpe,
 	.handle_fpe = kvm_trap_emul_handle_fpe,
 	.handle_msa_disabled = kvm_trap_emul_handle_msa_disabled,
+	.handle_guest_exit = kvm_trap_emul_no_handler,
 
+	.hardware_enable = kvm_trap_emul_hardware_enable,
+	.hardware_disable = kvm_trap_emul_hardware_disable,
+	.check_extension = kvm_trap_emul_check_extension,
 	.vcpu_init = kvm_trap_emul_vcpu_init,
 	.vcpu_uninit = kvm_trap_emul_vcpu_uninit,
 	.vcpu_setup = kvm_trap_emul_vcpu_setup,
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
new file mode 100644
index 0000000..71d8856
--- /dev/null
+++ b/arch/mips/kvm/vz.c
@@ -0,0 +1,3223 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: Support for hardware virtualization extensions
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Yann Le Du <ledu@kymasys.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/cacheops.h>
+#include <asm/cmpxchg.h>
+#include <asm/fpu.h>
+#include <asm/hazards.h>
+#include <asm/inst.h>
+#include <asm/mmu_context.h>
+#include <asm/r4kcache.h>
+#include <asm/time.h>
+#include <asm/tlb.h>
+#include <asm/tlbex.h>
+
+#include <linux/kvm_host.h>
+
+#include "interrupt.h"
+
+#include "trace.h"
+
+/* Pointers to last VCPU loaded on each physical CPU */
+static struct kvm_vcpu *last_vcpu[NR_CPUS];
+/* Pointers to last VCPU executed on each physical CPU */
+static struct kvm_vcpu *last_exec_vcpu[NR_CPUS];
+
+/*
+ * Number of guest VTLB entries to use, so we can catch inconsistency between
+ * CPUs.
+ */
+static unsigned int kvm_vz_guest_vtlb_size;
+
+static inline long kvm_vz_read_gc0_ebase(void)
+{
+	if (sizeof(long) == 8 && cpu_has_ebase_wg)
+		return read_gc0_ebase_64();
+	else
+		return read_gc0_ebase();
+}
+
+static inline void kvm_vz_write_gc0_ebase(long v)
+{
+	/*
+	 * First write with WG=1 to write upper bits, then write again in case
+	 * WG should be left at 0.
+	 * write_gc0_ebase_64() is no longer UNDEFINED since R6.
+	 */
+	if (sizeof(long) == 8 &&
+	    (cpu_has_mips64r6 || cpu_has_ebase_wg)) {
+		write_gc0_ebase_64(v | MIPS_EBASE_WG);
+		write_gc0_ebase_64(v);
+	} else {
+		write_gc0_ebase(v | MIPS_EBASE_WG);
+		write_gc0_ebase(v);
+	}
+}
+
+/*
+ * These Config bits may be writable by the guest:
+ * Config:	[K23, KU] (!TLB), K0
+ * Config1:	(none)
+ * Config2:	[TU, SU] (impl)
+ * Config3:	ISAOnExc
+ * Config4:	FTLBPageSize
+ * Config5:	K, CV, MSAEn, UFE, FRE, SBRI, UFR
+ */
+
+static inline unsigned int kvm_vz_config_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return CONF_CM_CMASK;
+}
+
+static inline unsigned int kvm_vz_config1_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline unsigned int kvm_vz_config2_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline unsigned int kvm_vz_config3_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	return MIPS_CONF3_ISA_OE;
+}
+
+static inline unsigned int kvm_vz_config4_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	/* no need to be exact */
+	return MIPS_CONF4_VFTLBPAGESIZE;
+}
+
+static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+	unsigned int mask = MIPS_CONF5_K | MIPS_CONF5_CV | MIPS_CONF5_SBRI;
+
+	/* Permit MSAEn changes if MSA supported and enabled */
+	if (kvm_mips_guest_has_msa(&vcpu->arch))
+		mask |= MIPS_CONF5_MSAEN;
+
+	/*
+	 * Permit guest FPU mode changes if FPU is enabled and the relevant
+	 * feature exists according to FIR register.
+	 */
+	if (kvm_mips_guest_has_fpu(&vcpu->arch)) {
+		if (cpu_has_ufr)
+			mask |= MIPS_CONF5_UFR;
+		if (cpu_has_fre)
+			mask |= MIPS_CONF5_FRE | MIPS_CONF5_UFE;
+	}
+
+	return mask;
+}
+
+/*
+ * VZ optionally allows these additional Config bits to be written by root:
+ * Config:	M, [MT]
+ * Config1:	M, [MMUSize-1, C2, MD, PC, WR, CA], FP
+ * Config2:	M
+ * Config3:	M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC,
+ *		VInt, SP, CDMM, MT, SM, TL]
+ * Config4:	M, [VTLBSizeExt, MMUSizeExt]
+ * Config5:	MRP
+ */
+
+static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config1_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	unsigned int mask = kvm_vz_config1_guest_wrmask(vcpu) | MIPS_CONF_M;
+
+	/* Permit FPU to be present if FPU is supported */
+	if (kvm_mips_guest_can_have_fpu(&vcpu->arch))
+		mask |= MIPS_CONF1_FP;
+
+	return mask;
+}
+
+static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config2_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M |
+		MIPS_CONF3_ULRI | MIPS_CONF3_CTXTC;
+
+	/* Permit MSA to be present if MSA is supported */
+	if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+		mask |= MIPS_CONF3_MSA;
+
+	return mask;
+}
+
+static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config4_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu)
+{
+	return kvm_vz_config5_guest_wrmask(vcpu) | MIPS_CONF5_MRP;
+}
+
+static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva)
+{
+	/* VZ guest has already converted gva to gpa */
+	return gva;
+}
+
+static void kvm_vz_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	set_bit(priority, &vcpu->arch.pending_exceptions);
+	clear_bit(priority, &vcpu->arch.pending_exceptions_clr);
+}
+
+static void kvm_vz_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	clear_bit(priority, &vcpu->arch.pending_exceptions);
+	set_bit(priority, &vcpu->arch.pending_exceptions_clr);
+}
+
+static void kvm_vz_queue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * timer expiry is asynchronous to vcpu execution therefore defer guest
+	 * cp0 accesses
+	 */
+	kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+static void kvm_vz_dequeue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * timer expiry is asynchronous to vcpu execution therefore defer guest
+	 * cp0 accesses
+	 */
+	kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+static void kvm_vz_queue_io_int_cb(struct kvm_vcpu *vcpu,
+				   struct kvm_mips_interrupt *irq)
+{
+	int intr = (int)irq->irq;
+
+	/*
+	 * interrupts are asynchronous to vcpu execution therefore defer guest
+	 * cp0 accesses
+	 */
+	switch (intr) {
+	case 2:
+		kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IO);
+		break;
+
+	case 3:
+		kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_1);
+		break;
+
+	case 4:
+		kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_2);
+		break;
+
+	default:
+		break;
+	}
+
+}
+
+static void kvm_vz_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
+				     struct kvm_mips_interrupt *irq)
+{
+	int intr = (int)irq->irq;
+
+	/*
+	 * interrupts are asynchronous to vcpu execution therefore defer guest
+	 * cp0 accesses
+	 */
+	switch (intr) {
+	case -2:
+		kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IO);
+		break;
+
+	case -3:
+		kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_1);
+		break;
+
+	case -4:
+		kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_2);
+		break;
+
+	default:
+		break;
+	}
+
+}
+
+static u32 kvm_vz_priority_to_irq[MIPS_EXC_MAX] = {
+	[MIPS_EXC_INT_TIMER] = C_IRQ5,
+	[MIPS_EXC_INT_IO]    = C_IRQ0,
+	[MIPS_EXC_INT_IPI_1] = C_IRQ1,
+	[MIPS_EXC_INT_IPI_2] = C_IRQ2,
+};
+
+static int kvm_vz_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+				 u32 cause)
+{
+	u32 irq = (priority < MIPS_EXC_MAX) ?
+		kvm_vz_priority_to_irq[priority] : 0;
+
+	switch (priority) {
+	case MIPS_EXC_INT_TIMER:
+		set_gc0_cause(C_TI);
+		break;
+
+	case MIPS_EXC_INT_IO:
+	case MIPS_EXC_INT_IPI_1:
+	case MIPS_EXC_INT_IPI_2:
+		if (cpu_has_guestctl2)
+			set_c0_guestctl2(irq);
+		else
+			set_gc0_cause(irq);
+		break;
+
+	default:
+		break;
+	}
+
+	clear_bit(priority, &vcpu->arch.pending_exceptions);
+	return 1;
+}
+
+static int kvm_vz_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+			       u32 cause)
+{
+	u32 irq = (priority < MIPS_EXC_MAX) ?
+		kvm_vz_priority_to_irq[priority] : 0;
+
+	switch (priority) {
+	case MIPS_EXC_INT_TIMER:
+		/*
+		 * Call to kvm_write_c0_guest_compare() clears Cause.TI in
+		 * kvm_mips_emulate_CP0(). Explicitly clear irq associated with
+		 * Cause.IP[IPTI] if GuestCtl2 virtual interrupt register not
+		 * supported or if not using GuestCtl2 Hardware Clear.
+		 */
+		if (cpu_has_guestctl2) {
+			if (!(read_c0_guestctl2() & (irq << 14)))
+				clear_c0_guestctl2(irq);
+		} else {
+			clear_gc0_cause(irq);
+		}
+		break;
+
+	case MIPS_EXC_INT_IO:
+	case MIPS_EXC_INT_IPI_1:
+	case MIPS_EXC_INT_IPI_2:
+		/* Clear GuestCtl2.VIP irq if not using Hardware Clear */
+		if (cpu_has_guestctl2) {
+			if (!(read_c0_guestctl2() & (irq << 14)))
+				clear_c0_guestctl2(irq);
+		} else {
+			clear_gc0_cause(irq);
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	clear_bit(priority, &vcpu->arch.pending_exceptions_clr);
+	return 1;
+}
+
+/*
+ * VZ guest timer handling.
+ */
+
+/**
+ * kvm_vz_should_use_htimer() - Find whether to use the VZ hard guest timer.
+ * @vcpu:	Virtual CPU.
+ *
+ * Returns:	true if the VZ GTOffset & real guest CP0_Count should be used
+ *		instead of software emulation of guest timer.
+ *		false otherwise.
+ */
+static bool kvm_vz_should_use_htimer(struct kvm_vcpu *vcpu)
+{
+	if (kvm_mips_count_disabled(vcpu))
+		return false;
+
+	/* Chosen frequency must match real frequency */
+	if (mips_hpt_frequency != vcpu->arch.count_hz)
+		return false;
+
+	/* We don't support a CP0_GTOffset with fewer bits than CP0_Count */
+	if (current_cpu_data.gtoffset_mask != 0xffffffff)
+		return false;
+
+	return true;
+}
+
+/**
+ * _kvm_vz_restore_stimer() - Restore soft timer state.
+ * @vcpu:	Virtual CPU.
+ * @compare:	CP0_Compare register value, restored by caller.
+ * @cause:	CP0_Cause register to restore.
+ *
+ * Restore VZ state relating to the soft timer. The hard timer can be enabled
+ * later.
+ */
+static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare,
+				   u32 cause)
+{
+	/*
+	 * Avoid spurious counter interrupts by setting Guest CP0_Count to just
+	 * after Guest CP0_Compare.
+	 */
+	write_c0_gtoffset(compare - read_c0_count());
+
+	back_to_back_c0_hazard();
+	write_gc0_cause(cause);
+}
+
+/**
+ * _kvm_vz_restore_htimer() - Restore hard timer state.
+ * @vcpu:	Virtual CPU.
+ * @compare:	CP0_Compare register value, restored by caller.
+ * @cause:	CP0_Cause register to restore.
+ *
+ * Restore hard timer Guest.Count & Guest.Cause taking care to preserve the
+ * value of Guest.CP0_Cause.TI while restoring Guest.CP0_Cause.
+ */
+static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu,
+				   u32 compare, u32 cause)
+{
+	u32 start_count, after_count;
+	ktime_t freeze_time;
+	unsigned long flags;
+
+	/*
+	 * Freeze the soft-timer and sync the guest CP0_Count with it. We do
+	 * this with interrupts disabled to avoid latency.
+	 */
+	local_irq_save(flags);
+	freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count);
+	write_c0_gtoffset(start_count - read_c0_count());
+	local_irq_restore(flags);
+
+	/* restore guest CP0_Cause, as TI may already be set */
+	back_to_back_c0_hazard();
+	write_gc0_cause(cause);
+
+	/*
+	 * The above sequence isn't atomic and would result in lost timer
+	 * interrupts if we're not careful. Detect if a timer interrupt is due
+	 * and assert it.
+	 */
+	back_to_back_c0_hazard();
+	after_count = read_gc0_count();
+	if (after_count - start_count > compare - start_count - 1)
+		kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+/**
+ * kvm_vz_restore_timer() - Restore timer state.
+ * @vcpu:	Virtual CPU.
+ *
+ * Restore soft timer state from saved context.
+ */
+static void kvm_vz_restore_timer(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	u32 cause, compare;
+
+	compare = kvm_read_sw_gc0_compare(cop0);
+	cause = kvm_read_sw_gc0_cause(cop0);
+
+	write_gc0_compare(compare);
+	_kvm_vz_restore_stimer(vcpu, compare, cause);
+}
+
+/**
+ * kvm_vz_acquire_htimer() - Switch to hard timer state.
+ * @vcpu:	Virtual CPU.
+ *
+ * Restore hard timer state on top of existing soft timer state if possible.
+ *
+ * Since hard timer won't remain active over preemption, preemption should be
+ * disabled by the caller.
+ */
+void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu)
+{
+	u32 gctl0;
+
+	gctl0 = read_c0_guestctl0();
+	if (!(gctl0 & MIPS_GCTL0_GT) && kvm_vz_should_use_htimer(vcpu)) {
+		/* enable guest access to hard timer */
+		write_c0_guestctl0(gctl0 | MIPS_GCTL0_GT);
+
+		_kvm_vz_restore_htimer(vcpu, read_gc0_compare(),
+				       read_gc0_cause());
+	}
+}
+
+/**
+ * _kvm_vz_save_htimer() - Switch to software emulation of guest timer.
+ * @vcpu:	Virtual CPU.
+ * @compare:	Pointer to write compare value to.
+ * @cause:	Pointer to write cause value to.
+ *
+ * Save VZ guest timer state and switch to software emulation of guest CP0
+ * timer. The hard timer must already be in use, so preemption should be
+ * disabled.
+ */
+static void _kvm_vz_save_htimer(struct kvm_vcpu *vcpu,
+				u32 *out_compare, u32 *out_cause)
+{
+	u32 cause, compare, before_count, end_count;
+	ktime_t before_time;
+
+	compare = read_gc0_compare();
+	*out_compare = compare;
+
+	before_time = ktime_get();
+
+	/*
+	 * Record the CP0_Count *prior* to saving CP0_Cause, so we have a time
+	 * at which no pending timer interrupt is missing.
+	 */
+	before_count = read_gc0_count();
+	back_to_back_c0_hazard();
+	cause = read_gc0_cause();
+	*out_cause = cause;
+
+	/*
+	 * Record a final CP0_Count which we will transfer to the soft-timer.
+	 * This is recorded *after* saving CP0_Cause, so we don't get any timer
+	 * interrupts from just after the final CP0_Count point.
+	 */
+	back_to_back_c0_hazard();
+	end_count = read_gc0_count();
+
+	/*
+	 * The above sequence isn't atomic, so we could miss a timer interrupt
+	 * between reading CP0_Cause and end_count. Detect and record any timer
+	 * interrupt due between before_count and end_count.
+	 */
+	if (end_count - before_count > compare - before_count - 1)
+		kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+
+	/*
+	 * Restore soft-timer, ignoring a small amount of negative drift due to
+	 * delay between freeze_hrtimer and setting CP0_GTOffset.
+	 */
+	kvm_mips_restore_hrtimer(vcpu, before_time, end_count, -0x10000);
+}
+
+/**
+ * kvm_vz_save_timer() - Save guest timer state.
+ * @vcpu:	Virtual CPU.
+ *
+ * Save VZ guest timer state and switch to soft guest timer if hard timer was in
+ * use.
+ */
+static void kvm_vz_save_timer(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	u32 gctl0, compare, cause;
+
+	gctl0 = read_c0_guestctl0();
+	if (gctl0 & MIPS_GCTL0_GT) {
+		/* disable guest use of hard timer */
+		write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT);
+
+		/* save hard timer state */
+		_kvm_vz_save_htimer(vcpu, &compare, &cause);
+	} else {
+		compare = read_gc0_compare();
+		cause = read_gc0_cause();
+	}
+
+	/* save timer-related state to VCPU context */
+	kvm_write_sw_gc0_cause(cop0, cause);
+	kvm_write_sw_gc0_compare(cop0, compare);
+}
+
+/**
+ * kvm_vz_lose_htimer() - Ensure hard guest timer is not in use.
+ * @vcpu:	Virtual CPU.
+ *
+ * Transfers the state of the hard guest timer to the soft guest timer, leaving
+ * guest state intact so it can continue to be used with the soft timer.
+ */
+void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu)
+{
+	u32 gctl0, compare, cause;
+
+	preempt_disable();
+	gctl0 = read_c0_guestctl0();
+	if (gctl0 & MIPS_GCTL0_GT) {
+		/* disable guest use of timer */
+		write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT);
+
+		/* switch to soft timer */
+		_kvm_vz_save_htimer(vcpu, &compare, &cause);
+
+		/* leave soft timer in usable state */
+		_kvm_vz_restore_stimer(vcpu, compare, cause);
+	}
+	preempt_enable();
+}
+
+/**
+ * is_eva_access() - Find whether an instruction is an EVA memory accessor.
+ * @inst:	32-bit instruction encoding.
+ *
+ * Finds whether @inst encodes an EVA memory access instruction, which would
+ * indicate that emulation of it should access the user mode address space
+ * instead of the kernel mode address space. This matters for MUSUK segments
+ * which are TLB mapped for user mode but unmapped for kernel mode.
+ *
+ * Returns:	Whether @inst encodes an EVA accessor instruction.
+ */
+static bool is_eva_access(union mips_instruction inst)
+{
+	if (inst.spec3_format.opcode != spec3_op)
+		return false;
+
+	switch (inst.spec3_format.func) {
+	case lwle_op:
+	case lwre_op:
+	case cachee_op:
+	case sbe_op:
+	case she_op:
+	case sce_op:
+	case swe_op:
+	case swle_op:
+	case swre_op:
+	case prefe_op:
+	case lbue_op:
+	case lhue_op:
+	case lbe_op:
+	case lhe_op:
+	case lle_op:
+	case lwe_op:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * is_eva_am_mapped() - Find whether an access mode is mapped.
+ * @vcpu:	KVM VCPU state.
+ * @am:		3-bit encoded access mode.
+ * @eu:		Segment becomes unmapped and uncached when Status.ERL=1.
+ *
+ * Decode @am to find whether it encodes a mapped segment for the current VCPU
+ * state. Where necessary @eu and the actual instruction causing the fault are
+ * taken into account to make the decision.
+ *
+ * Returns:	Whether the VCPU faulted on a TLB mapped address.
+ */
+static bool is_eva_am_mapped(struct kvm_vcpu *vcpu, unsigned int am, bool eu)
+{
+	u32 am_lookup;
+	int err;
+
+	/*
+	 * Interpret access control mode. We assume address errors will already
+	 * have been caught by the guest, leaving us with:
+	 *      AM      UM  SM  KM  31..24 23..16
+	 * UK    0 000          Unm   0      0
+	 * MK    1 001          TLB   1
+	 * MSK   2 010      TLB TLB   1
+	 * MUSK  3 011  TLB TLB TLB   1
+	 * MUSUK 4 100  TLB TLB Unm   0      1
+	 * USK   5 101      Unm Unm   0      0
+	 * -     6 110                0      0
+	 * UUSK  7 111  Unm Unm Unm   0      0
+	 *
+	 * We shift a magic value by AM across the sign bit to find if always
+	 * TLB mapped, and if not shift by 8 again to find if it depends on KM.
+	 */
+	am_lookup = 0x70080000 << am;
+	if ((s32)am_lookup < 0) {
+		/*
+		 * MK, MSK, MUSK
+		 * Always TLB mapped, unless SegCtl.EU && ERL
+		 */
+		if (!eu || !(read_gc0_status() & ST0_ERL))
+			return true;
+	} else {
+		am_lookup <<= 8;
+		if ((s32)am_lookup < 0) {
+			union mips_instruction inst;
+			unsigned int status;
+			u32 *opc;
+
+			/*
+			 * MUSUK
+			 * TLB mapped if not in kernel mode
+			 */
+			status = read_gc0_status();
+			if (!(status & (ST0_EXL | ST0_ERL)) &&
+			    (status & ST0_KSU))
+				return true;
+			/*
+			 * EVA access instructions in kernel
+			 * mode access user address space.
+			 */
+			opc = (u32 *)vcpu->arch.pc;
+			if (vcpu->arch.host_cp0_cause & CAUSEF_BD)
+				opc += 1;
+			err = kvm_get_badinstr(opc, vcpu, &inst.word);
+			if (!err && is_eva_access(inst))
+				return true;
+		}
+	}
+
+	return false;
+}
+
+/**
+ * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA.
+ * @vcpu:	KVM VCPU state.
+ * @gva:	Guest virtual address to convert.
+ * @gpa:	Output guest physical address.
+ *
+ * Convert a guest virtual address (GVA) which is valid according to the guest
+ * context, to a guest physical address (GPA).
+ *
+ * Returns:	0 on success.
+ *		-errno on failure.
+ */
+static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+			     unsigned long *gpa)
+{
+	u32 gva32 = gva;
+	unsigned long segctl;
+
+	if ((long)gva == (s32)gva32) {
+		/* Handle canonical 32-bit virtual address */
+		if (cpu_guest_has_segments) {
+			unsigned long mask, pa;
+
+			switch (gva32 >> 29) {
+			case 0:
+			case 1: /* CFG5 (1GB) */
+				segctl = read_gc0_segctl2() >> 16;
+				mask = (unsigned long)0xfc0000000ull;
+				break;
+			case 2:
+			case 3: /* CFG4 (1GB) */
+				segctl = read_gc0_segctl2();
+				mask = (unsigned long)0xfc0000000ull;
+				break;
+			case 4: /* CFG3 (512MB) */
+				segctl = read_gc0_segctl1() >> 16;
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			case 5: /* CFG2 (512MB) */
+				segctl = read_gc0_segctl1();
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			case 6: /* CFG1 (512MB) */
+				segctl = read_gc0_segctl0() >> 16;
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			case 7: /* CFG0 (512MB) */
+				segctl = read_gc0_segctl0();
+				mask = (unsigned long)0xfe0000000ull;
+				break;
+			default:
+				/*
+				 * GCC 4.9 isn't smart enough to figure out that
+				 * segctl and mask are always initialised.
+				 */
+				unreachable();
+			}
+
+			if (is_eva_am_mapped(vcpu, (segctl >> 4) & 0x7,
+					     segctl & 0x0008))
+				goto tlb_mapped;
+
+			/* Unmapped, find guest physical address */
+			pa = (segctl << 20) & mask;
+			pa |= gva32 & ~mask;
+			*gpa = pa;
+			return 0;
+		} else if ((s32)gva32 < (s32)0xc0000000) {
+			/* legacy unmapped KSeg0 or KSeg1 */
+			*gpa = gva32 & 0x1fffffff;
+			return 0;
+		}
+#ifdef CONFIG_64BIT
+	} else if ((gva & 0xc000000000000000) == 0x8000000000000000) {
+		/* XKPHYS */
+		if (cpu_guest_has_segments) {
+			/*
+			 * Each of the 8 regions can be overridden by SegCtl2.XR
+			 * to use SegCtl1.XAM.
+			 */
+			segctl = read_gc0_segctl2();
+			if (segctl & (1ull << (56 + ((gva >> 59) & 0x7)))) {
+				segctl = read_gc0_segctl1();
+				if (is_eva_am_mapped(vcpu, (segctl >> 59) & 0x7,
+						     0))
+					goto tlb_mapped;
+			}
+
+		}
+		/*
+		 * Traditionally fully unmapped.
+		 * Bits 61:59 specify the CCA, which we can just mask off here.
+		 * Bits 58:PABITS should be zero, but we shouldn't have got here
+		 * if it wasn't.
+		 */
+		*gpa = gva & 0x07ffffffffffffff;
+		return 0;
+#endif
+	}
+
+tlb_mapped:
+	return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa);
+}
+
+/**
+ * kvm_vz_badvaddr_to_gpa() - Convert GVA BadVAddr from root exception to GPA.
+ * @vcpu:	KVM VCPU state.
+ * @badvaddr:	Root BadVAddr.
+ * @gpa:	Output guest physical address.
+ *
+ * VZ implementations are permitted to report guest virtual addresses (GVA) in
+ * BadVAddr on a root exception during guest execution, instead of the more
+ * convenient guest physical addresses (GPA). When we get a GVA, this function
+ * converts it to a GPA, taking into account guest segmentation and guest TLB
+ * state.
+ *
+ * Returns:	0 on success.
+ *		-errno on failure.
+ */
+static int kvm_vz_badvaddr_to_gpa(struct kvm_vcpu *vcpu, unsigned long badvaddr,
+				  unsigned long *gpa)
+{
+	unsigned int gexccode = (vcpu->arch.host_cp0_guestctl0 &
+				 MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT;
+
+	/* If BadVAddr is GPA, then all is well in the world */
+	if (likely(gexccode == MIPS_GCTL0_GEXC_GPA)) {
+		*gpa = badvaddr;
+		return 0;
+	}
+
+	/* Otherwise we'd expect it to be GVA ... */
+	if (WARN(gexccode != MIPS_GCTL0_GEXC_GVA,
+		 "Unexpected gexccode %#x\n", gexccode))
+		return -EINVAL;
+
+	/* ... and we need to perform the GVA->GPA translation in software */
+	return kvm_vz_gva_to_gpa(vcpu, badvaddr, gpa);
+}
+
+static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu)
+{
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE;
+	unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
+	u32 inst = 0;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	kvm_get_badinstr(opc, vcpu, &inst);
+
+	kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n",
+		exccode, opc, inst, badvaddr,
+		read_gc0_status());
+	kvm_arch_vcpu_dump_regs(vcpu);
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	return RESUME_HOST;
+}
+
+static unsigned long mips_process_maar(unsigned int op, unsigned long val)
+{
+	/* Mask off unused bits */
+	unsigned long mask = 0xfffff000 | MIPS_MAAR_S | MIPS_MAAR_VL;
+
+	if (read_gc0_pagegrain() & PG_ELPA)
+		mask |= 0x00ffffff00000000ull;
+	if (cpu_guest_has_mvh)
+		mask |= MIPS_MAAR_VH;
+
+	/* Set or clear VH */
+	if (op == mtc_op) {
+		/* clear VH */
+		val &= ~MIPS_MAAR_VH;
+	} else if (op == dmtc_op) {
+		/* set VH to match VL */
+		val &= ~MIPS_MAAR_VH;
+		if (val & MIPS_MAAR_VL)
+			val |= MIPS_MAAR_VH;
+	}
+
+	return val & mask;
+}
+
+static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+	val &= MIPS_MAARI_INDEX;
+	if (val == MIPS_MAARI_INDEX)
+		kvm_write_sw_gc0_maari(cop0, ARRAY_SIZE(vcpu->arch.maar) - 1);
+	else if (val < ARRAY_SIZE(vcpu->arch.maar))
+		kvm_write_sw_gc0_maari(cop0, val);
+}
+
+static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
+					      u32 *opc, u32 cause,
+					      struct kvm_run *run,
+					      struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	enum emulation_result er = EMULATE_DONE;
+	u32 rt, rd, sel;
+	unsigned long curr_pc;
+	unsigned long val;
+
+	/*
+	 * Update PC and hold onto current PC in case there is
+	 * an error and we want to rollback the PC
+	 */
+	curr_pc = vcpu->arch.pc;
+	er = update_pc(vcpu, cause);
+	if (er == EMULATE_FAIL)
+		return er;
+
+	if (inst.co_format.co) {
+		switch (inst.co_format.func) {
+		case wait_op:
+			er = kvm_mips_emul_wait(vcpu);
+			break;
+		default:
+			er = EMULATE_FAIL;
+		}
+	} else {
+		rt = inst.c0r_format.rt;
+		rd = inst.c0r_format.rd;
+		sel = inst.c0r_format.sel;
+
+		switch (inst.c0r_format.rs) {
+		case dmfc_op:
+		case mfc_op:
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+			cop0->stat[rd][sel]++;
+#endif
+			if (rd == MIPS_CP0_COUNT &&
+			    sel == 0) {			/* Count */
+				val = kvm_mips_read_count(vcpu);
+			} else if (rd == MIPS_CP0_COMPARE &&
+				   sel == 0) {		/* Compare */
+				val = read_gc0_compare();
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   sel == 0) {		/* LLAddr */
+				if (cpu_guest_has_rw_llb)
+					val = read_gc0_lladdr() &
+						MIPS_LLADDR_LLB;
+				else
+					val = 0;
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   sel == 1 &&		/* MAAR */
+				   cpu_guest_has_maar &&
+				   !cpu_guest_has_dyn_maar) {
+				/* MAARI must be in range */
+				BUG_ON(kvm_read_sw_gc0_maari(cop0) >=
+						ARRAY_SIZE(vcpu->arch.maar));
+				val = vcpu->arch.maar[
+					kvm_read_sw_gc0_maari(cop0)];
+			} else if ((rd == MIPS_CP0_PRID &&
+				    (sel == 0 ||	/* PRid */
+				     sel == 2 ||	/* CDMMBase */
+				     sel == 3)) ||	/* CMGCRBase */
+				   (rd == MIPS_CP0_STATUS &&
+				    (sel == 2 ||	/* SRSCtl */
+				     sel == 3)) ||	/* SRSMap */
+				   (rd == MIPS_CP0_CONFIG &&
+				    (sel == 7)) ||	/* Config7 */
+				   (rd == MIPS_CP0_LLADDR &&
+				    (sel == 2) &&	/* MAARI */
+				    cpu_guest_has_maar &&
+				    !cpu_guest_has_dyn_maar) ||
+				   (rd == MIPS_CP0_ERRCTL &&
+				    (sel == 0))) {	/* ErrCtl */
+				val = cop0->reg[rd][sel];
+			} else {
+				val = 0;
+				er = EMULATE_FAIL;
+			}
+
+			if (er != EMULATE_FAIL) {
+				/* Sign extend */
+				if (inst.c0r_format.rs == mfc_op)
+					val = (int)val;
+				vcpu->arch.gprs[rt] = val;
+			}
+
+			trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mfc_op) ?
+					KVM_TRACE_MFC0 : KVM_TRACE_DMFC0,
+				      KVM_TRACE_COP0(rd, sel), val);
+			break;
+
+		case dmtc_op:
+		case mtc_op:
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+			cop0->stat[rd][sel]++;
+#endif
+			val = vcpu->arch.gprs[rt];
+			trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mtc_op) ?
+					KVM_TRACE_MTC0 : KVM_TRACE_DMTC0,
+				      KVM_TRACE_COP0(rd, sel), val);
+
+			if (rd == MIPS_CP0_COUNT &&
+			    sel == 0) {			/* Count */
+				kvm_vz_lose_htimer(vcpu);
+				kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
+			} else if (rd == MIPS_CP0_COMPARE &&
+				   sel == 0) {		/* Compare */
+				kvm_mips_write_compare(vcpu,
+						       vcpu->arch.gprs[rt],
+						       true);
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   sel == 0) {		/* LLAddr */
+				/*
+				 * P5600 generates GPSI on guest MTC0 LLAddr.
+				 * Only allow the guest to clear LLB.
+				 */
+				if (cpu_guest_has_rw_llb &&
+				    !(val & MIPS_LLADDR_LLB))
+					write_gc0_lladdr(0);
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   sel == 1 &&		/* MAAR */
+				   cpu_guest_has_maar &&
+				   !cpu_guest_has_dyn_maar) {
+				val = mips_process_maar(inst.c0r_format.rs,
+							val);
+
+				/* MAARI must be in range */
+				BUG_ON(kvm_read_sw_gc0_maari(cop0) >=
+						ARRAY_SIZE(vcpu->arch.maar));
+				vcpu->arch.maar[kvm_read_sw_gc0_maari(cop0)] =
+									val;
+			} else if (rd == MIPS_CP0_LLADDR &&
+				   (sel == 2) &&	/* MAARI */
+				   cpu_guest_has_maar &&
+				   !cpu_guest_has_dyn_maar) {
+				kvm_write_maari(vcpu, val);
+			} else if (rd == MIPS_CP0_ERRCTL &&
+				   (sel == 0)) {	/* ErrCtl */
+				/* ignore the written value */
+			} else {
+				er = EMULATE_FAIL;
+			}
+			break;
+
+		default:
+			er = EMULATE_FAIL;
+			break;
+		}
+	}
+	/* Rollback PC only if emulation was unsuccessful */
+	if (er == EMULATE_FAIL) {
+		kvm_err("[%#lx]%s: unsupported cop0 instruction 0x%08x\n",
+			curr_pc, __func__, inst.word);
+
+		vcpu->arch.pc = curr_pc;
+	}
+
+	return er;
+}
+
+static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst,
+					       u32 *opc, u32 cause,
+					       struct kvm_run *run,
+					       struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er = EMULATE_DONE;
+	u32 cache, op_inst, op, base;
+	s16 offset;
+	struct kvm_vcpu_arch *arch = &vcpu->arch;
+	unsigned long va, curr_pc;
+
+	/*
+	 * Update PC and hold onto current PC in case there is
+	 * an error and we want to rollback the PC
+	 */
+	curr_pc = vcpu->arch.pc;
+	er = update_pc(vcpu, cause);
+	if (er == EMULATE_FAIL)
+		return er;
+
+	base = inst.i_format.rs;
+	op_inst = inst.i_format.rt;
+	if (cpu_has_mips_r6)
+		offset = inst.spec3_format.simmediate;
+	else
+		offset = inst.i_format.simmediate;
+	cache = op_inst & CacheOp_Cache;
+	op = op_inst & CacheOp_Op;
+
+	va = arch->gprs[base] + offset;
+
+	kvm_debug("CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
+		  cache, op, base, arch->gprs[base], offset);
+
+	/* Secondary or tirtiary cache ops ignored */
+	if (cache != Cache_I && cache != Cache_D)
+		return EMULATE_DONE;
+
+	switch (op_inst) {
+	case Index_Invalidate_I:
+		flush_icache_line_indexed(va);
+		return EMULATE_DONE;
+	case Index_Writeback_Inv_D:
+		flush_dcache_line_indexed(va);
+		return EMULATE_DONE;
+	case Hit_Invalidate_I:
+	case Hit_Invalidate_D:
+	case Hit_Writeback_Inv_D:
+		if (boot_cpu_type() == CPU_CAVIUM_OCTEON3) {
+			/* We can just flush entire icache */
+			local_flush_icache_range(0, 0);
+			return EMULATE_DONE;
+		}
+
+		/* So far, other platforms support guest hit cache ops */
+		break;
+	default:
+		break;
+	};
+
+	kvm_err("@ %#lx/%#lx CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
+		curr_pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base],
+		offset);
+	/* Rollback PC */
+	vcpu->arch.pc = curr_pc;
+
+	return EMULATE_FAIL;
+}
+
+static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc,
+						     struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er = EMULATE_DONE;
+	struct kvm_vcpu_arch *arch = &vcpu->arch;
+	struct kvm_run *run = vcpu->run;
+	union mips_instruction inst;
+	int rd, rt, sel;
+	int err;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err)
+		return EMULATE_FAIL;
+
+	switch (inst.r_format.opcode) {
+	case cop0_op:
+		er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu);
+		break;
+#ifndef CONFIG_CPU_MIPSR6
+	case cache_op:
+		trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+		er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+		break;
+#endif
+	case spec3_op:
+		switch (inst.spec3_format.func) {
+#ifdef CONFIG_CPU_MIPSR6
+		case cache6_op:
+			trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+			er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+			break;
+#endif
+		case rdhwr_op:
+			if (inst.r_format.rs || (inst.r_format.re >> 3))
+				goto unknown;
+
+			rd = inst.r_format.rd;
+			rt = inst.r_format.rt;
+			sel = inst.r_format.re & 0x7;
+
+			switch (rd) {
+			case MIPS_HWR_CC:	/* Read count register */
+				arch->gprs[rt] =
+					(long)(int)kvm_mips_read_count(vcpu);
+				break;
+			default:
+				trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR,
+					      KVM_TRACE_HWR(rd, sel), 0);
+				goto unknown;
+			};
+
+			trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR,
+				      KVM_TRACE_HWR(rd, sel), arch->gprs[rt]);
+
+			er = update_pc(vcpu, cause);
+			break;
+		default:
+			goto unknown;
+		};
+		break;
+unknown:
+
+	default:
+		kvm_err("GPSI exception not supported (%p/%#x)\n",
+				opc, inst.word);
+		kvm_arch_vcpu_dump_regs(vcpu);
+		er = EMULATE_FAIL;
+		break;
+	}
+
+	return er;
+}
+
+static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc,
+						     struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er = EMULATE_DONE;
+	struct kvm_vcpu_arch *arch = &vcpu->arch;
+	union mips_instruction inst;
+	int err;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err)
+		return EMULATE_FAIL;
+
+	/* complete MTC0 on behalf of guest and advance EPC */
+	if (inst.c0r_format.opcode == cop0_op &&
+	    inst.c0r_format.rs == mtc_op &&
+	    inst.c0r_format.z == 0) {
+		int rt = inst.c0r_format.rt;
+		int rd = inst.c0r_format.rd;
+		int sel = inst.c0r_format.sel;
+		unsigned int val = arch->gprs[rt];
+		unsigned int old_val, change;
+
+		trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, KVM_TRACE_COP0(rd, sel),
+			      val);
+
+		if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
+			/* FR bit should read as zero if no FPU */
+			if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+				val &= ~(ST0_CU1 | ST0_FR);
+
+			/*
+			 * Also don't allow FR to be set if host doesn't support
+			 * it.
+			 */
+			if (!(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+				val &= ~ST0_FR;
+
+			old_val = read_gc0_status();
+			change = val ^ old_val;
+
+			if (change & ST0_FR) {
+				/*
+				 * FPU and Vector register state is made
+				 * UNPREDICTABLE by a change of FR, so don't
+				 * even bother saving it.
+				 */
+				kvm_drop_fpu(vcpu);
+			}
+
+			/*
+			 * If MSA state is already live, it is undefined how it
+			 * interacts with FR=0 FPU state, and we don't want to
+			 * hit reserved instruction exceptions trying to save
+			 * the MSA state later when CU=1 && FR=1, so play it
+			 * safe and save it first.
+			 */
+			if (change & ST0_CU1 && !(val & ST0_FR) &&
+			    vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
+				kvm_lose_fpu(vcpu);
+
+			write_gc0_status(val);
+		} else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
+			u32 old_cause = read_gc0_cause();
+			u32 change = old_cause ^ val;
+
+			/* DC bit enabling/disabling timer? */
+			if (change & CAUSEF_DC) {
+				if (val & CAUSEF_DC) {
+					kvm_vz_lose_htimer(vcpu);
+					kvm_mips_count_disable_cause(vcpu);
+				} else {
+					kvm_mips_count_enable_cause(vcpu);
+				}
+			}
+
+			/* Only certain bits are RW to the guest */
+			change &= (CAUSEF_DC | CAUSEF_IV | CAUSEF_WP |
+				   CAUSEF_IP0 | CAUSEF_IP1);
+
+			/* WP can only be cleared */
+			change &= ~CAUSEF_WP | old_cause;
+
+			write_gc0_cause(old_cause ^ change);
+		} else if ((rd == MIPS_CP0_STATUS) && (sel == 1)) { /* IntCtl */
+			write_gc0_intctl(val);
+		} else if ((rd == MIPS_CP0_CONFIG) && (sel == 5)) {
+			old_val = read_gc0_config5();
+			change = val ^ old_val;
+			/* Handle changes in FPU/MSA modes */
+			preempt_disable();
+
+			/*
+			 * Propagate FRE changes immediately if the FPU
+			 * context is already loaded.
+			 */
+			if (change & MIPS_CONF5_FRE &&
+			    vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
+				change_c0_config5(MIPS_CONF5_FRE, val);
+
+			preempt_enable();
+
+			val = old_val ^
+				(change & kvm_vz_config5_guest_wrmask(vcpu));
+			write_gc0_config5(val);
+		} else {
+			kvm_err("Handle GSFC, unsupported field change @ %p: %#x\n",
+			    opc, inst.word);
+			er = EMULATE_FAIL;
+		}
+
+		if (er != EMULATE_FAIL)
+			er = update_pc(vcpu, cause);
+	} else {
+		kvm_err("Handle GSFC, unrecognized instruction @ %p: %#x\n",
+			opc, inst.word);
+		er = EMULATE_FAIL;
+	}
+
+	return er;
+}
+
+static enum emulation_result kvm_trap_vz_handle_ghfc(u32 cause, u32 *opc,
+						     struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Presumably this is due to MC (guest mode change), so lets trace some
+	 * relevant info.
+	 */
+	trace_kvm_guest_mode_change(vcpu);
+
+	return EMULATE_DONE;
+}
+
+static enum emulation_result kvm_trap_vz_handle_hc(u32 cause, u32 *opc,
+						   struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+	union mips_instruction inst;
+	unsigned long curr_pc;
+	int err;
+
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	err = kvm_get_badinstr(opc, vcpu, &inst.word);
+	if (err)
+		return EMULATE_FAIL;
+
+	/*
+	 * Update PC and hold onto current PC in case there is
+	 * an error and we want to rollback the PC
+	 */
+	curr_pc = vcpu->arch.pc;
+	er = update_pc(vcpu, cause);
+	if (er == EMULATE_FAIL)
+		return er;
+
+	er = kvm_mips_emul_hypcall(vcpu, inst);
+	if (er == EMULATE_FAIL)
+		vcpu->arch.pc = curr_pc;
+
+	return er;
+}
+
+static enum emulation_result kvm_trap_vz_no_handler_guest_exit(u32 gexccode,
+							u32 cause,
+							u32 *opc,
+							struct kvm_vcpu *vcpu)
+{
+	u32 inst;
+
+	/*
+	 *  Fetch the instruction.
+	 */
+	if (cause & CAUSEF_BD)
+		opc += 1;
+	kvm_get_badinstr(opc, vcpu, &inst);
+
+	kvm_err("Guest Exception Code: %d not yet handled @ PC: %p, inst: 0x%08x  Status: %#x\n",
+		gexccode, opc, inst, read_gc0_status());
+
+	return EMULATE_FAIL;
+}
+
+static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu)
+{
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	enum emulation_result er = EMULATE_DONE;
+	u32 gexccode = (vcpu->arch.host_cp0_guestctl0 &
+			MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT;
+	int ret = RESUME_GUEST;
+
+	trace_kvm_exit(vcpu, KVM_TRACE_EXIT_GEXCCODE_BASE + gexccode);
+	switch (gexccode) {
+	case MIPS_GCTL0_GEXC_GPSI:
+		++vcpu->stat.vz_gpsi_exits;
+		er = kvm_trap_vz_handle_gpsi(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GSFC:
+		++vcpu->stat.vz_gsfc_exits;
+		er = kvm_trap_vz_handle_gsfc(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_HC:
+		++vcpu->stat.vz_hc_exits;
+		er = kvm_trap_vz_handle_hc(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GRR:
+		++vcpu->stat.vz_grr_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GVA:
+		++vcpu->stat.vz_gva_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GHFC:
+		++vcpu->stat.vz_ghfc_exits;
+		er = kvm_trap_vz_handle_ghfc(cause, opc, vcpu);
+		break;
+	case MIPS_GCTL0_GEXC_GPA:
+		++vcpu->stat.vz_gpa_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+	default:
+		++vcpu->stat.vz_resvd_exits;
+		er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+						       vcpu);
+		break;
+
+	}
+
+	if (er == EMULATE_DONE) {
+		ret = RESUME_GUEST;
+	} else if (er == EMULATE_HYPERCALL) {
+		ret = kvm_mips_handle_hypcall(vcpu);
+	} else {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = RESUME_HOST;
+	}
+	return ret;
+}
+
+/**
+ * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor.
+ * @vcpu:	Virtual CPU context.
+ *
+ * Handle when the guest attempts to use a coprocessor which hasn't been allowed
+ * by the root context.
+ */
+static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	enum emulation_result er = EMULATE_FAIL;
+	int ret = RESUME_GUEST;
+
+	if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) {
+		/*
+		 * If guest FPU not present, the FPU operation should have been
+		 * treated as a reserved instruction!
+		 * If FPU already in use, we shouldn't get this at all.
+		 */
+		if (WARN_ON(!kvm_mips_guest_has_fpu(&vcpu->arch) ||
+			    vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
+			preempt_enable();
+			return EMULATE_FAIL;
+		}
+
+		kvm_own_fpu(vcpu);
+		er = EMULATE_DONE;
+	}
+	/* other coprocessors not handled */
+
+	switch (er) {
+	case EMULATE_DONE:
+		ret = RESUME_GUEST;
+		break;
+
+	case EMULATE_FAIL:
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = RESUME_HOST;
+		break;
+
+	default:
+		BUG();
+	}
+	return ret;
+}
+
+/**
+ * kvm_trap_vz_handle_msa_disabled() - Guest used MSA while disabled in root.
+ * @vcpu:	Virtual CPU context.
+ *
+ * Handle when the guest attempts to use MSA when it is disabled in the root
+ * context.
+ */
+static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	/*
+	 * If MSA not present or not exposed to guest or FR=0, the MSA operation
+	 * should have been treated as a reserved instruction!
+	 * Same if CU1=1, FR=0.
+	 * If MSA already in use, we shouldn't get this at all.
+	 */
+	if (!kvm_mips_guest_has_msa(&vcpu->arch) ||
+	    (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 ||
+	    !(read_gc0_config5() & MIPS_CONF5_MSAEN) ||
+	    vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return RESUME_HOST;
+	}
+
+	kvm_own_msa(vcpu);
+
+	return RESUME_GUEST;
+}
+
+static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	ulong badvaddr = vcpu->arch.host_cp0_badvaddr;
+	union mips_instruction inst;
+	enum emulation_result er = EMULATE_DONE;
+	int err, ret = RESUME_GUEST;
+
+	if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, false)) {
+		/* A code fetch fault doesn't count as an MMIO */
+		if (kvm_is_ifetch_fault(&vcpu->arch)) {
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
+
+		/* Fetch the instruction */
+		if (cause & CAUSEF_BD)
+			opc += 1;
+		err = kvm_get_badinstr(opc, vcpu, &inst.word);
+		if (err) {
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
+
+		/* Treat as MMIO */
+		er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+		if (er == EMULATE_FAIL) {
+			kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n",
+				opc, badvaddr);
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		}
+	}
+
+	if (er == EMULATE_DONE) {
+		ret = RESUME_GUEST;
+	} else if (er == EMULATE_DO_MMIO) {
+		run->exit_reason = KVM_EXIT_MMIO;
+		ret = RESUME_HOST;
+	} else {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = RESUME_HOST;
+	}
+	return ret;
+}
+
+static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 *opc = (u32 *) vcpu->arch.pc;
+	u32 cause = vcpu->arch.host_cp0_cause;
+	ulong badvaddr = vcpu->arch.host_cp0_badvaddr;
+	union mips_instruction inst;
+	enum emulation_result er = EMULATE_DONE;
+	int err;
+	int ret = RESUME_GUEST;
+
+	/* Just try the access again if we couldn't do the translation */
+	if (kvm_vz_badvaddr_to_gpa(vcpu, badvaddr, &badvaddr))
+		return RESUME_GUEST;
+	vcpu->arch.host_cp0_badvaddr = badvaddr;
+
+	if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, true)) {
+		/* Fetch the instruction */
+		if (cause & CAUSEF_BD)
+			opc += 1;
+		err = kvm_get_badinstr(opc, vcpu, &inst.word);
+		if (err) {
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
+
+		/* Treat as MMIO */
+		er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+		if (er == EMULATE_FAIL) {
+			kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n",
+				opc, badvaddr);
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		}
+	}
+
+	if (er == EMULATE_DONE) {
+		ret = RESUME_GUEST;
+	} else if (er == EMULATE_DO_MMIO) {
+		run->exit_reason = KVM_EXIT_MMIO;
+		ret = RESUME_HOST;
+	} else {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = RESUME_HOST;
+	}
+	return ret;
+}
+
+static u64 kvm_vz_get_one_regs[] = {
+	KVM_REG_MIPS_CP0_INDEX,
+	KVM_REG_MIPS_CP0_ENTRYLO0,
+	KVM_REG_MIPS_CP0_ENTRYLO1,
+	KVM_REG_MIPS_CP0_CONTEXT,
+	KVM_REG_MIPS_CP0_PAGEMASK,
+	KVM_REG_MIPS_CP0_PAGEGRAIN,
+	KVM_REG_MIPS_CP0_WIRED,
+	KVM_REG_MIPS_CP0_HWRENA,
+	KVM_REG_MIPS_CP0_BADVADDR,
+	KVM_REG_MIPS_CP0_COUNT,
+	KVM_REG_MIPS_CP0_ENTRYHI,
+	KVM_REG_MIPS_CP0_COMPARE,
+	KVM_REG_MIPS_CP0_STATUS,
+	KVM_REG_MIPS_CP0_INTCTL,
+	KVM_REG_MIPS_CP0_CAUSE,
+	KVM_REG_MIPS_CP0_EPC,
+	KVM_REG_MIPS_CP0_PRID,
+	KVM_REG_MIPS_CP0_EBASE,
+	KVM_REG_MIPS_CP0_CONFIG,
+	KVM_REG_MIPS_CP0_CONFIG1,
+	KVM_REG_MIPS_CP0_CONFIG2,
+	KVM_REG_MIPS_CP0_CONFIG3,
+	KVM_REG_MIPS_CP0_CONFIG4,
+	KVM_REG_MIPS_CP0_CONFIG5,
+#ifdef CONFIG_64BIT
+	KVM_REG_MIPS_CP0_XCONTEXT,
+#endif
+	KVM_REG_MIPS_CP0_ERROREPC,
+
+	KVM_REG_MIPS_COUNT_CTL,
+	KVM_REG_MIPS_COUNT_RESUME,
+	KVM_REG_MIPS_COUNT_HZ,
+};
+
+static u64 kvm_vz_get_one_regs_contextconfig[] = {
+	KVM_REG_MIPS_CP0_CONTEXTCONFIG,
+#ifdef CONFIG_64BIT
+	KVM_REG_MIPS_CP0_XCONTEXTCONFIG,
+#endif
+};
+
+static u64 kvm_vz_get_one_regs_segments[] = {
+	KVM_REG_MIPS_CP0_SEGCTL0,
+	KVM_REG_MIPS_CP0_SEGCTL1,
+	KVM_REG_MIPS_CP0_SEGCTL2,
+};
+
+static u64 kvm_vz_get_one_regs_htw[] = {
+	KVM_REG_MIPS_CP0_PWBASE,
+	KVM_REG_MIPS_CP0_PWFIELD,
+	KVM_REG_MIPS_CP0_PWSIZE,
+	KVM_REG_MIPS_CP0_PWCTL,
+};
+
+static u64 kvm_vz_get_one_regs_kscratch[] = {
+	KVM_REG_MIPS_CP0_KSCRATCH1,
+	KVM_REG_MIPS_CP0_KSCRATCH2,
+	KVM_REG_MIPS_CP0_KSCRATCH3,
+	KVM_REG_MIPS_CP0_KSCRATCH4,
+	KVM_REG_MIPS_CP0_KSCRATCH5,
+	KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
+static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
+{
+	unsigned long ret;
+
+	ret = ARRAY_SIZE(kvm_vz_get_one_regs);
+	if (cpu_guest_has_userlocal)
+		++ret;
+	if (cpu_guest_has_badinstr)
+		++ret;
+	if (cpu_guest_has_badinstrp)
+		++ret;
+	if (cpu_guest_has_contextconfig)
+		ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
+	if (cpu_guest_has_segments)
+		ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
+	if (cpu_guest_has_htw)
+		ret += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
+	if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar)
+		ret += 1 + ARRAY_SIZE(vcpu->arch.maar);
+	ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
+
+	return ret;
+}
+
+static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+	u64 index;
+	unsigned int i;
+
+	if (copy_to_user(indices, kvm_vz_get_one_regs,
+			 sizeof(kvm_vz_get_one_regs)))
+		return -EFAULT;
+	indices += ARRAY_SIZE(kvm_vz_get_one_regs);
+
+	if (cpu_guest_has_userlocal) {
+		index = KVM_REG_MIPS_CP0_USERLOCAL;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
+	if (cpu_guest_has_badinstr) {
+		index = KVM_REG_MIPS_CP0_BADINSTR;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
+	if (cpu_guest_has_badinstrp) {
+		index = KVM_REG_MIPS_CP0_BADINSTRP;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
+	if (cpu_guest_has_contextconfig) {
+		if (copy_to_user(indices, kvm_vz_get_one_regs_contextconfig,
+				 sizeof(kvm_vz_get_one_regs_contextconfig)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
+	}
+	if (cpu_guest_has_segments) {
+		if (copy_to_user(indices, kvm_vz_get_one_regs_segments,
+				 sizeof(kvm_vz_get_one_regs_segments)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
+	}
+	if (cpu_guest_has_htw) {
+		if (copy_to_user(indices, kvm_vz_get_one_regs_htw,
+				 sizeof(kvm_vz_get_one_regs_htw)))
+			return -EFAULT;
+		indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
+	}
+	if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) {
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.maar); ++i) {
+			index = KVM_REG_MIPS_CP0_MAAR(i);
+			if (copy_to_user(indices, &index, sizeof(index)))
+				return -EFAULT;
+			++indices;
+		}
+
+		index = KVM_REG_MIPS_CP0_MAARI;
+		if (copy_to_user(indices, &index, sizeof(index)))
+			return -EFAULT;
+		++indices;
+	}
+	for (i = 0; i < 6; ++i) {
+		if (!cpu_guest_has_kscr(i + 2))
+			continue;
+
+		if (copy_to_user(indices, &kvm_vz_get_one_regs_kscratch[i],
+				 sizeof(kvm_vz_get_one_regs_kscratch[i])))
+			return -EFAULT;
+		++indices;
+	}
+
+	return 0;
+}
+
+static inline s64 entrylo_kvm_to_user(unsigned long v)
+{
+	s64 mask, ret = v;
+
+	if (BITS_PER_LONG == 32) {
+		/*
+		 * KVM API exposes 64-bit version of the register, so move the
+		 * RI/XI bits up into place.
+		 */
+		mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI;
+		ret &= ~mask;
+		ret |= ((s64)v & mask) << 32;
+	}
+	return ret;
+}
+
+static inline unsigned long entrylo_user_to_kvm(s64 v)
+{
+	unsigned long mask, ret = v;
+
+	if (BITS_PER_LONG == 32) {
+		/*
+		 * KVM API exposes 64-bit versiono of the register, so move the
+		 * RI/XI bits down into place.
+		 */
+		mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI;
+		ret &= ~mask;
+		ret |= (v >> 32) & mask;
+	}
+	return ret;
+}
+
+static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
+			      const struct kvm_one_reg *reg,
+			      s64 *v)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned int idx;
+
+	switch (reg->id) {
+	case KVM_REG_MIPS_CP0_INDEX:
+		*v = (long)read_gc0_index();
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO0:
+		*v = entrylo_kvm_to_user(read_gc0_entrylo0());
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO1:
+		*v = entrylo_kvm_to_user(read_gc0_entrylo1());
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXT:
+		*v = (long)read_gc0_context();
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		*v = read_gc0_contextconfig();
+		break;
+	case KVM_REG_MIPS_CP0_USERLOCAL:
+		if (!cpu_guest_has_userlocal)
+			return -EINVAL;
+		*v = read_gc0_userlocal();
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		*v = read_gc0_xcontextconfig();
+		break;
+#endif
+	case KVM_REG_MIPS_CP0_PAGEMASK:
+		*v = (long)read_gc0_pagemask();
+		break;
+	case KVM_REG_MIPS_CP0_PAGEGRAIN:
+		*v = (long)read_gc0_pagegrain();
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL0:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		*v = read_gc0_segctl0();
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL1:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		*v = read_gc0_segctl1();
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL2:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		*v = read_gc0_segctl2();
+		break;
+	case KVM_REG_MIPS_CP0_PWBASE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwbase();
+		break;
+	case KVM_REG_MIPS_CP0_PWFIELD:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwfield();
+		break;
+	case KVM_REG_MIPS_CP0_PWSIZE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwsize();
+		break;
+	case KVM_REG_MIPS_CP0_WIRED:
+		*v = (long)read_gc0_wired();
+		break;
+	case KVM_REG_MIPS_CP0_PWCTL:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		*v = read_gc0_pwctl();
+		break;
+	case KVM_REG_MIPS_CP0_HWRENA:
+		*v = (long)read_gc0_hwrena();
+		break;
+	case KVM_REG_MIPS_CP0_BADVADDR:
+		*v = (long)read_gc0_badvaddr();
+		break;
+	case KVM_REG_MIPS_CP0_BADINSTR:
+		if (!cpu_guest_has_badinstr)
+			return -EINVAL;
+		*v = read_gc0_badinstr();
+		break;
+	case KVM_REG_MIPS_CP0_BADINSTRP:
+		if (!cpu_guest_has_badinstrp)
+			return -EINVAL;
+		*v = read_gc0_badinstrp();
+		break;
+	case KVM_REG_MIPS_CP0_COUNT:
+		*v = kvm_mips_read_count(vcpu);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYHI:
+		*v = (long)read_gc0_entryhi();
+		break;
+	case KVM_REG_MIPS_CP0_COMPARE:
+		*v = (long)read_gc0_compare();
+		break;
+	case KVM_REG_MIPS_CP0_STATUS:
+		*v = (long)read_gc0_status();
+		break;
+	case KVM_REG_MIPS_CP0_INTCTL:
+		*v = read_gc0_intctl();
+		break;
+	case KVM_REG_MIPS_CP0_CAUSE:
+		*v = (long)read_gc0_cause();
+		break;
+	case KVM_REG_MIPS_CP0_EPC:
+		*v = (long)read_gc0_epc();
+		break;
+	case KVM_REG_MIPS_CP0_PRID:
+		switch (boot_cpu_type()) {
+		case CPU_CAVIUM_OCTEON3:
+			/* Octeon III has a read-only guest.PRid */
+			*v = read_gc0_prid();
+			break;
+		default:
+			*v = (long)kvm_read_c0_guest_prid(cop0);
+			break;
+		};
+		break;
+	case KVM_REG_MIPS_CP0_EBASE:
+		*v = kvm_vz_read_gc0_ebase();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG:
+		*v = read_gc0_config();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG1:
+		if (!cpu_guest_has_conf1)
+			return -EINVAL;
+		*v = read_gc0_config1();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG2:
+		if (!cpu_guest_has_conf2)
+			return -EINVAL;
+		*v = read_gc0_config2();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG3:
+		if (!cpu_guest_has_conf3)
+			return -EINVAL;
+		*v = read_gc0_config3();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG4:
+		if (!cpu_guest_has_conf4)
+			return -EINVAL;
+		*v = read_gc0_config4();
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG5:
+		if (!cpu_guest_has_conf5)
+			return -EINVAL;
+		*v = read_gc0_config5();
+		break;
+	case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f):
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0);
+		if (idx >= ARRAY_SIZE(vcpu->arch.maar))
+			return -EINVAL;
+		*v = vcpu->arch.maar[idx];
+		break;
+	case KVM_REG_MIPS_CP0_MAARI:
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		*v = kvm_read_sw_gc0_maari(vcpu->arch.cop0);
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXT:
+		*v = read_gc0_xcontext();
+		break;
+#endif
+	case KVM_REG_MIPS_CP0_ERROREPC:
+		*v = (long)read_gc0_errorepc();
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+		if (!cpu_guest_has_kscr(idx))
+			return -EINVAL;
+		switch (idx) {
+		case 2:
+			*v = (long)read_gc0_kscratch1();
+			break;
+		case 3:
+			*v = (long)read_gc0_kscratch2();
+			break;
+		case 4:
+			*v = (long)read_gc0_kscratch3();
+			break;
+		case 5:
+			*v = (long)read_gc0_kscratch4();
+			break;
+		case 6:
+			*v = (long)read_gc0_kscratch5();
+			break;
+		case 7:
+			*v = (long)read_gc0_kscratch6();
+			break;
+		}
+		break;
+	case KVM_REG_MIPS_COUNT_CTL:
+		*v = vcpu->arch.count_ctl;
+		break;
+	case KVM_REG_MIPS_COUNT_RESUME:
+		*v = ktime_to_ns(vcpu->arch.count_resume);
+		break;
+	case KVM_REG_MIPS_COUNT_HZ:
+		*v = vcpu->arch.count_hz;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
+			      const struct kvm_one_reg *reg,
+			      s64 v)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned int idx;
+	int ret = 0;
+	unsigned int cur, change;
+
+	switch (reg->id) {
+	case KVM_REG_MIPS_CP0_INDEX:
+		write_gc0_index(v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO0:
+		write_gc0_entrylo0(entrylo_user_to_kvm(v));
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYLO1:
+		write_gc0_entrylo1(entrylo_user_to_kvm(v));
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXT:
+		write_gc0_context(v);
+		break;
+	case KVM_REG_MIPS_CP0_CONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		write_gc0_contextconfig(v);
+		break;
+	case KVM_REG_MIPS_CP0_USERLOCAL:
+		if (!cpu_guest_has_userlocal)
+			return -EINVAL;
+		write_gc0_userlocal(v);
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXTCONFIG:
+		if (!cpu_guest_has_contextconfig)
+			return -EINVAL;
+		write_gc0_xcontextconfig(v);
+		break;
+#endif
+	case KVM_REG_MIPS_CP0_PAGEMASK:
+		write_gc0_pagemask(v);
+		break;
+	case KVM_REG_MIPS_CP0_PAGEGRAIN:
+		write_gc0_pagegrain(v);
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL0:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		write_gc0_segctl0(v);
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL1:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		write_gc0_segctl1(v);
+		break;
+	case KVM_REG_MIPS_CP0_SEGCTL2:
+		if (!cpu_guest_has_segments)
+			return -EINVAL;
+		write_gc0_segctl2(v);
+		break;
+	case KVM_REG_MIPS_CP0_PWBASE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwbase(v);
+		break;
+	case KVM_REG_MIPS_CP0_PWFIELD:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwfield(v);
+		break;
+	case KVM_REG_MIPS_CP0_PWSIZE:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwsize(v);
+		break;
+	case KVM_REG_MIPS_CP0_WIRED:
+		change_gc0_wired(MIPSR6_WIRED_WIRED, v);
+		break;
+	case KVM_REG_MIPS_CP0_PWCTL:
+		if (!cpu_guest_has_htw)
+			return -EINVAL;
+		write_gc0_pwctl(v);
+		break;
+	case KVM_REG_MIPS_CP0_HWRENA:
+		write_gc0_hwrena(v);
+		break;
+	case KVM_REG_MIPS_CP0_BADVADDR:
+		write_gc0_badvaddr(v);
+		break;
+	case KVM_REG_MIPS_CP0_BADINSTR:
+		if (!cpu_guest_has_badinstr)
+			return -EINVAL;
+		write_gc0_badinstr(v);
+		break;
+	case KVM_REG_MIPS_CP0_BADINSTRP:
+		if (!cpu_guest_has_badinstrp)
+			return -EINVAL;
+		write_gc0_badinstrp(v);
+		break;
+	case KVM_REG_MIPS_CP0_COUNT:
+		kvm_mips_write_count(vcpu, v);
+		break;
+	case KVM_REG_MIPS_CP0_ENTRYHI:
+		write_gc0_entryhi(v);
+		break;
+	case KVM_REG_MIPS_CP0_COMPARE:
+		kvm_mips_write_compare(vcpu, v, false);
+		break;
+	case KVM_REG_MIPS_CP0_STATUS:
+		write_gc0_status(v);
+		break;
+	case KVM_REG_MIPS_CP0_INTCTL:
+		write_gc0_intctl(v);
+		break;
+	case KVM_REG_MIPS_CP0_CAUSE:
+		/*
+		 * If the timer is stopped or started (DC bit) it must look
+		 * atomic with changes to the timer interrupt pending bit (TI).
+		 * A timer interrupt should not happen in between.
+		 */
+		if ((read_gc0_cause() ^ v) & CAUSEF_DC) {
+			if (v & CAUSEF_DC) {
+				/* disable timer first */
+				kvm_mips_count_disable_cause(vcpu);
+				change_gc0_cause((u32)~CAUSEF_DC, v);
+			} else {
+				/* enable timer last */
+				change_gc0_cause((u32)~CAUSEF_DC, v);
+				kvm_mips_count_enable_cause(vcpu);
+			}
+		} else {
+			write_gc0_cause(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_EPC:
+		write_gc0_epc(v);
+		break;
+	case KVM_REG_MIPS_CP0_PRID:
+		switch (boot_cpu_type()) {
+		case CPU_CAVIUM_OCTEON3:
+			/* Octeon III has a guest.PRid, but its read-only */
+			break;
+		default:
+			kvm_write_c0_guest_prid(cop0, v);
+			break;
+		};
+		break;
+	case KVM_REG_MIPS_CP0_EBASE:
+		kvm_vz_write_gc0_ebase(v);
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG:
+		cur = read_gc0_config();
+		change = (cur ^ v) & kvm_vz_config_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG1:
+		if (!cpu_guest_has_conf1)
+			break;
+		cur = read_gc0_config1();
+		change = (cur ^ v) & kvm_vz_config1_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config1(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG2:
+		if (!cpu_guest_has_conf2)
+			break;
+		cur = read_gc0_config2();
+		change = (cur ^ v) & kvm_vz_config2_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config2(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG3:
+		if (!cpu_guest_has_conf3)
+			break;
+		cur = read_gc0_config3();
+		change = (cur ^ v) & kvm_vz_config3_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config3(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG4:
+		if (!cpu_guest_has_conf4)
+			break;
+		cur = read_gc0_config4();
+		change = (cur ^ v) & kvm_vz_config4_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config4(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_CONFIG5:
+		if (!cpu_guest_has_conf5)
+			break;
+		cur = read_gc0_config5();
+		change = (cur ^ v) & kvm_vz_config5_user_wrmask(vcpu);
+		if (change) {
+			v = cur ^ change;
+			write_gc0_config5(v);
+		}
+		break;
+	case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f):
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0);
+		if (idx >= ARRAY_SIZE(vcpu->arch.maar))
+			return -EINVAL;
+		vcpu->arch.maar[idx] = mips_process_maar(dmtc_op, v);
+		break;
+	case KVM_REG_MIPS_CP0_MAARI:
+		if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+			return -EINVAL;
+		kvm_write_maari(vcpu, v);
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_REG_MIPS_CP0_XCONTEXT:
+		write_gc0_xcontext(v);
+		break;
+#endif
+	case KVM_REG_MIPS_CP0_ERROREPC:
+		write_gc0_errorepc(v);
+		break;
+	case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+		idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+		if (!cpu_guest_has_kscr(idx))
+			return -EINVAL;
+		switch (idx) {
+		case 2:
+			write_gc0_kscratch1(v);
+			break;
+		case 3:
+			write_gc0_kscratch2(v);
+			break;
+		case 4:
+			write_gc0_kscratch3(v);
+			break;
+		case 5:
+			write_gc0_kscratch4(v);
+			break;
+		case 6:
+			write_gc0_kscratch5(v);
+			break;
+		case 7:
+			write_gc0_kscratch6(v);
+			break;
+		}
+		break;
+	case KVM_REG_MIPS_COUNT_CTL:
+		ret = kvm_mips_set_count_ctl(vcpu, v);
+		break;
+	case KVM_REG_MIPS_COUNT_RESUME:
+		ret = kvm_mips_set_count_resume(vcpu, v);
+		break;
+	case KVM_REG_MIPS_COUNT_HZ:
+		ret = kvm_mips_set_count_hz(vcpu, v);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return ret;
+}
+
+#define guestid_cache(cpu)	(cpu_data[cpu].guestid_cache)
+static void kvm_vz_get_new_guestid(unsigned long cpu, struct kvm_vcpu *vcpu)
+{
+	unsigned long guestid = guestid_cache(cpu);
+
+	if (!(++guestid & GUESTID_MASK)) {
+		if (cpu_has_vtag_icache)
+			flush_icache_all();
+
+		if (!guestid)		/* fix version if needed */
+			guestid = GUESTID_FIRST_VERSION;
+
+		++guestid;		/* guestid 0 reserved for root */
+
+		/* start new guestid cycle */
+		kvm_vz_local_flush_roottlb_all_guests();
+		kvm_vz_local_flush_guesttlb_all();
+	}
+
+	guestid_cache(cpu) = guestid;
+}
+
+/* Returns 1 if the guest TLB may be clobbered */
+static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu)
+{
+	int ret = 0;
+	int i;
+
+	if (!vcpu->requests)
+		return 0;
+
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+		if (cpu_has_guestid) {
+			/* Drop all GuestIDs for this VCPU */
+			for_each_possible_cpu(i)
+				vcpu->arch.vzguestid[i] = 0;
+			/* This will clobber guest TLB contents too */
+			ret = 1;
+		}
+		/*
+		 * For Root ASID Dealias (RAD) we don't do anything here, but we
+		 * still need the request to ensure we recheck asid_flush_mask.
+		 * We can still return 0 as only the root TLB will be affected
+		 * by a root ASID flush.
+		 */
+	}
+
+	return ret;
+}
+
+static void kvm_vz_vcpu_save_wired(struct kvm_vcpu *vcpu)
+{
+	unsigned int wired = read_gc0_wired();
+	struct kvm_mips_tlb *tlbs;
+	int i;
+
+	/* Expand the wired TLB array if necessary */
+	wired &= MIPSR6_WIRED_WIRED;
+	if (wired > vcpu->arch.wired_tlb_limit) {
+		tlbs = krealloc(vcpu->arch.wired_tlb, wired *
+				sizeof(*vcpu->arch.wired_tlb), GFP_ATOMIC);
+		if (WARN_ON(!tlbs)) {
+			/* Save whatever we can */
+			wired = vcpu->arch.wired_tlb_limit;
+		} else {
+			vcpu->arch.wired_tlb = tlbs;
+			vcpu->arch.wired_tlb_limit = wired;
+		}
+	}
+
+	if (wired)
+		/* Save wired entries from the guest TLB */
+		kvm_vz_save_guesttlb(vcpu->arch.wired_tlb, 0, wired);
+	/* Invalidate any dropped entries since last time */
+	for (i = wired; i < vcpu->arch.wired_tlb_used; ++i) {
+		vcpu->arch.wired_tlb[i].tlb_hi = UNIQUE_GUEST_ENTRYHI(i);
+		vcpu->arch.wired_tlb[i].tlb_lo[0] = 0;
+		vcpu->arch.wired_tlb[i].tlb_lo[1] = 0;
+		vcpu->arch.wired_tlb[i].tlb_mask = 0;
+	}
+	vcpu->arch.wired_tlb_used = wired;
+}
+
+static void kvm_vz_vcpu_load_wired(struct kvm_vcpu *vcpu)
+{
+	/* Load wired entries into the guest TLB */
+	if (vcpu->arch.wired_tlb)
+		kvm_vz_load_guesttlb(vcpu->arch.wired_tlb, 0,
+				     vcpu->arch.wired_tlb_used);
+}
+
+static void kvm_vz_vcpu_load_tlb(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct mm_struct *gpa_mm = &kvm->arch.gpa_mm;
+	bool migrated;
+
+	/*
+	 * Are we entering guest context on a different CPU to last time?
+	 * If so, the VCPU's guest TLB state on this CPU may be stale.
+	 */
+	migrated = (vcpu->arch.last_exec_cpu != cpu);
+	vcpu->arch.last_exec_cpu = cpu;
+
+	/*
+	 * A vcpu's GuestID is set in GuestCtl1.ID when the vcpu is loaded and
+	 * remains set until another vcpu is loaded in.  As a rule GuestRID
+	 * remains zeroed when in root context unless the kernel is busy
+	 * manipulating guest tlb entries.
+	 */
+	if (cpu_has_guestid) {
+		/*
+		 * Check if our GuestID is of an older version and thus invalid.
+		 *
+		 * We also discard the stored GuestID if we've executed on
+		 * another CPU, as the guest mappings may have changed without
+		 * hypervisor knowledge.
+		 */
+		if (migrated ||
+		    (vcpu->arch.vzguestid[cpu] ^ guestid_cache(cpu)) &
+					GUESTID_VERSION_MASK) {
+			kvm_vz_get_new_guestid(cpu, vcpu);
+			vcpu->arch.vzguestid[cpu] = guestid_cache(cpu);
+			trace_kvm_guestid_change(vcpu,
+						 vcpu->arch.vzguestid[cpu]);
+		}
+
+		/* Restore GuestID */
+		change_c0_guestctl1(GUESTID_MASK, vcpu->arch.vzguestid[cpu]);
+	} else {
+		/*
+		 * The Guest TLB only stores a single guest's TLB state, so
+		 * flush it if another VCPU has executed on this CPU.
+		 *
+		 * We also flush if we've executed on another CPU, as the guest
+		 * mappings may have changed without hypervisor knowledge.
+		 */
+		if (migrated || last_exec_vcpu[cpu] != vcpu)
+			kvm_vz_local_flush_guesttlb_all();
+		last_exec_vcpu[cpu] = vcpu;
+
+		/*
+		 * Root ASID dealiases guest GPA mappings in the root TLB.
+		 * Allocate new root ASID if needed.
+		 */
+		if (cpumask_test_and_clear_cpu(cpu, &kvm->arch.asid_flush_mask)
+		    || (cpu_context(cpu, gpa_mm) ^ asid_cache(cpu)) &
+						asid_version_mask(cpu))
+			get_new_mmu_context(gpa_mm, cpu);
+	}
+}
+
+static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	bool migrated, all;
+
+	/*
+	 * Have we migrated to a different CPU?
+	 * If so, any old guest TLB state may be stale.
+	 */
+	migrated = (vcpu->arch.last_sched_cpu != cpu);
+
+	/*
+	 * Was this the last VCPU to run on this CPU?
+	 * If not, any old guest state from this VCPU will have been clobbered.
+	 */
+	all = migrated || (last_vcpu[cpu] != vcpu);
+	last_vcpu[cpu] = vcpu;
+
+	/*
+	 * Restore CP0_Wired unconditionally as we clear it after use, and
+	 * restore wired guest TLB entries (while in guest context).
+	 */
+	kvm_restore_gc0_wired(cop0);
+	if (current->flags & PF_VCPU) {
+		tlbw_use_hazard();
+		kvm_vz_vcpu_load_tlb(vcpu, cpu);
+		kvm_vz_vcpu_load_wired(vcpu);
+	}
+
+	/*
+	 * Restore timer state regardless, as e.g. Cause.TI can change over time
+	 * if left unmaintained.
+	 */
+	kvm_vz_restore_timer(vcpu);
+
+	/* Set MC bit if we want to trace guest mode changes */
+	if (kvm_trace_guest_mode_change)
+		set_c0_guestctl0(MIPS_GCTL0_MC);
+	else
+		clear_c0_guestctl0(MIPS_GCTL0_MC);
+
+	/* Don't bother restoring registers multiple times unless necessary */
+	if (!all)
+		return 0;
+
+	/*
+	 * Restore config registers first, as some implementations restrict
+	 * writes to other registers when the corresponding feature bits aren't
+	 * set. For example Status.CU1 cannot be set unless Config1.FP is set.
+	 */
+	kvm_restore_gc0_config(cop0);
+	if (cpu_guest_has_conf1)
+		kvm_restore_gc0_config1(cop0);
+	if (cpu_guest_has_conf2)
+		kvm_restore_gc0_config2(cop0);
+	if (cpu_guest_has_conf3)
+		kvm_restore_gc0_config3(cop0);
+	if (cpu_guest_has_conf4)
+		kvm_restore_gc0_config4(cop0);
+	if (cpu_guest_has_conf5)
+		kvm_restore_gc0_config5(cop0);
+	if (cpu_guest_has_conf6)
+		kvm_restore_gc0_config6(cop0);
+	if (cpu_guest_has_conf7)
+		kvm_restore_gc0_config7(cop0);
+
+	kvm_restore_gc0_index(cop0);
+	kvm_restore_gc0_entrylo0(cop0);
+	kvm_restore_gc0_entrylo1(cop0);
+	kvm_restore_gc0_context(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_restore_gc0_contextconfig(cop0);
+#ifdef CONFIG_64BIT
+	kvm_restore_gc0_xcontext(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_restore_gc0_xcontextconfig(cop0);
+#endif
+	kvm_restore_gc0_pagemask(cop0);
+	kvm_restore_gc0_pagegrain(cop0);
+	kvm_restore_gc0_hwrena(cop0);
+	kvm_restore_gc0_badvaddr(cop0);
+	kvm_restore_gc0_entryhi(cop0);
+	kvm_restore_gc0_status(cop0);
+	kvm_restore_gc0_intctl(cop0);
+	kvm_restore_gc0_epc(cop0);
+	kvm_vz_write_gc0_ebase(kvm_read_sw_gc0_ebase(cop0));
+	if (cpu_guest_has_userlocal)
+		kvm_restore_gc0_userlocal(cop0);
+
+	kvm_restore_gc0_errorepc(cop0);
+
+	/* restore KScratch registers if enabled in guest */
+	if (cpu_guest_has_conf4) {
+		if (cpu_guest_has_kscr(2))
+			kvm_restore_gc0_kscratch1(cop0);
+		if (cpu_guest_has_kscr(3))
+			kvm_restore_gc0_kscratch2(cop0);
+		if (cpu_guest_has_kscr(4))
+			kvm_restore_gc0_kscratch3(cop0);
+		if (cpu_guest_has_kscr(5))
+			kvm_restore_gc0_kscratch4(cop0);
+		if (cpu_guest_has_kscr(6))
+			kvm_restore_gc0_kscratch5(cop0);
+		if (cpu_guest_has_kscr(7))
+			kvm_restore_gc0_kscratch6(cop0);
+	}
+
+	if (cpu_guest_has_badinstr)
+		kvm_restore_gc0_badinstr(cop0);
+	if (cpu_guest_has_badinstrp)
+		kvm_restore_gc0_badinstrp(cop0);
+
+	if (cpu_guest_has_segments) {
+		kvm_restore_gc0_segctl0(cop0);
+		kvm_restore_gc0_segctl1(cop0);
+		kvm_restore_gc0_segctl2(cop0);
+	}
+
+	/* restore HTW registers */
+	if (cpu_guest_has_htw) {
+		kvm_restore_gc0_pwbase(cop0);
+		kvm_restore_gc0_pwfield(cop0);
+		kvm_restore_gc0_pwsize(cop0);
+		kvm_restore_gc0_pwctl(cop0);
+	}
+
+	/* restore Root.GuestCtl2 from unused Guest guestctl2 register */
+	if (cpu_has_guestctl2)
+		write_c0_guestctl2(
+			cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL]);
+
+	/*
+	 * We should clear linked load bit to break interrupted atomics. This
+	 * prevents a SC on the next VCPU from succeeding by matching a LL on
+	 * the previous VCPU.
+	 */
+	if (cpu_guest_has_rw_llb)
+		write_gc0_lladdr(0);
+
+	return 0;
+}
+
+static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+	if (current->flags & PF_VCPU)
+		kvm_vz_vcpu_save_wired(vcpu);
+
+	kvm_lose_fpu(vcpu);
+
+	kvm_save_gc0_index(cop0);
+	kvm_save_gc0_entrylo0(cop0);
+	kvm_save_gc0_entrylo1(cop0);
+	kvm_save_gc0_context(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_save_gc0_contextconfig(cop0);
+#ifdef CONFIG_64BIT
+	kvm_save_gc0_xcontext(cop0);
+	if (cpu_guest_has_contextconfig)
+		kvm_save_gc0_xcontextconfig(cop0);
+#endif
+	kvm_save_gc0_pagemask(cop0);
+	kvm_save_gc0_pagegrain(cop0);
+	kvm_save_gc0_wired(cop0);
+	/* allow wired TLB entries to be overwritten */
+	clear_gc0_wired(MIPSR6_WIRED_WIRED);
+	kvm_save_gc0_hwrena(cop0);
+	kvm_save_gc0_badvaddr(cop0);
+	kvm_save_gc0_entryhi(cop0);
+	kvm_save_gc0_status(cop0);
+	kvm_save_gc0_intctl(cop0);
+	kvm_save_gc0_epc(cop0);
+	kvm_write_sw_gc0_ebase(cop0, kvm_vz_read_gc0_ebase());
+	if (cpu_guest_has_userlocal)
+		kvm_save_gc0_userlocal(cop0);
+
+	/* only save implemented config registers */
+	kvm_save_gc0_config(cop0);
+	if (cpu_guest_has_conf1)
+		kvm_save_gc0_config1(cop0);
+	if (cpu_guest_has_conf2)
+		kvm_save_gc0_config2(cop0);
+	if (cpu_guest_has_conf3)
+		kvm_save_gc0_config3(cop0);
+	if (cpu_guest_has_conf4)
+		kvm_save_gc0_config4(cop0);
+	if (cpu_guest_has_conf5)
+		kvm_save_gc0_config5(cop0);
+	if (cpu_guest_has_conf6)
+		kvm_save_gc0_config6(cop0);
+	if (cpu_guest_has_conf7)
+		kvm_save_gc0_config7(cop0);
+
+	kvm_save_gc0_errorepc(cop0);
+
+	/* save KScratch registers if enabled in guest */
+	if (cpu_guest_has_conf4) {
+		if (cpu_guest_has_kscr(2))
+			kvm_save_gc0_kscratch1(cop0);
+		if (cpu_guest_has_kscr(3))
+			kvm_save_gc0_kscratch2(cop0);
+		if (cpu_guest_has_kscr(4))
+			kvm_save_gc0_kscratch3(cop0);
+		if (cpu_guest_has_kscr(5))
+			kvm_save_gc0_kscratch4(cop0);
+		if (cpu_guest_has_kscr(6))
+			kvm_save_gc0_kscratch5(cop0);
+		if (cpu_guest_has_kscr(7))
+			kvm_save_gc0_kscratch6(cop0);
+	}
+
+	if (cpu_guest_has_badinstr)
+		kvm_save_gc0_badinstr(cop0);
+	if (cpu_guest_has_badinstrp)
+		kvm_save_gc0_badinstrp(cop0);
+
+	if (cpu_guest_has_segments) {
+		kvm_save_gc0_segctl0(cop0);
+		kvm_save_gc0_segctl1(cop0);
+		kvm_save_gc0_segctl2(cop0);
+	}
+
+	/* save HTW registers if enabled in guest */
+	if (cpu_guest_has_htw &&
+	    kvm_read_sw_gc0_config3(cop0) & MIPS_CONF3_PW) {
+		kvm_save_gc0_pwbase(cop0);
+		kvm_save_gc0_pwfield(cop0);
+		kvm_save_gc0_pwsize(cop0);
+		kvm_save_gc0_pwctl(cop0);
+	}
+
+	kvm_vz_save_timer(vcpu);
+
+	/* save Root.GuestCtl2 in unused Guest guestctl2 register */
+	if (cpu_has_guestctl2)
+		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] =
+			read_c0_guestctl2();
+
+	return 0;
+}
+
+/**
+ * kvm_vz_resize_guest_vtlb() - Attempt to resize guest VTLB.
+ * @size:	Number of guest VTLB entries (0 < @size <= root VTLB entries).
+ *
+ * Attempt to resize the guest VTLB by writing guest Config registers. This is
+ * necessary for cores with a shared root/guest TLB to avoid overlap with wired
+ * entries in the root VTLB.
+ *
+ * Returns:	The resulting guest VTLB size.
+ */
+static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
+{
+	unsigned int config4 = 0, ret = 0, limit;
+
+	/* Write MMUSize - 1 into guest Config registers */
+	if (cpu_guest_has_conf1)
+		change_gc0_config1(MIPS_CONF1_TLBS,
+				   (size - 1) << MIPS_CONF1_TLBS_SHIFT);
+	if (cpu_guest_has_conf4) {
+		config4 = read_gc0_config4();
+		if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) ==
+		    MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) {
+			config4 &= ~MIPS_CONF4_VTLBSIZEEXT;
+			config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) <<
+				MIPS_CONF4_VTLBSIZEEXT_SHIFT;
+		} else if ((config4 & MIPS_CONF4_MMUEXTDEF) ==
+			   MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) {
+			config4 &= ~MIPS_CONF4_MMUSIZEEXT;
+			config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) <<
+				MIPS_CONF4_MMUSIZEEXT_SHIFT;
+		}
+		write_gc0_config4(config4);
+	}
+
+	/*
+	 * Set Guest.Wired.Limit = 0 (no limit up to Guest.MMUSize-1), unless it
+	 * would exceed Root.Wired.Limit (clearing Guest.Wired.Wired so write
+	 * not dropped)
+	 */
+	if (cpu_has_mips_r6) {
+		limit = (read_c0_wired() & MIPSR6_WIRED_LIMIT) >>
+						MIPSR6_WIRED_LIMIT_SHIFT;
+		if (size - 1 <= limit)
+			limit = 0;
+		write_gc0_wired(limit << MIPSR6_WIRED_LIMIT_SHIFT);
+	}
+
+	/* Read back MMUSize - 1 */
+	back_to_back_c0_hazard();
+	if (cpu_guest_has_conf1)
+		ret = (read_gc0_config1() & MIPS_CONF1_TLBS) >>
+						MIPS_CONF1_TLBS_SHIFT;
+	if (config4) {
+		if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) ==
+		    MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT)
+			ret |= ((config4 & MIPS_CONF4_VTLBSIZEEXT) >>
+				MIPS_CONF4_VTLBSIZEEXT_SHIFT) <<
+				MIPS_CONF1_TLBS_SIZE;
+		else if ((config4 & MIPS_CONF4_MMUEXTDEF) ==
+			 MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT)
+			ret |= ((config4 & MIPS_CONF4_MMUSIZEEXT) >>
+				MIPS_CONF4_MMUSIZEEXT_SHIFT) <<
+				MIPS_CONF1_TLBS_SIZE;
+	}
+	return ret + 1;
+}
+
+static int kvm_vz_hardware_enable(void)
+{
+	unsigned int mmu_size, guest_mmu_size, ftlb_size;
+	u64 guest_cvmctl, cvmvmconfig;
+
+	switch (current_cpu_type()) {
+	case CPU_CAVIUM_OCTEON3:
+		/* Set up guest timer/perfcount IRQ lines */
+		guest_cvmctl = read_gc0_cvmctl();
+		guest_cvmctl &= ~CVMCTL_IPTI;
+		guest_cvmctl |= 7ull << CVMCTL_IPTI_SHIFT;
+		guest_cvmctl &= ~CVMCTL_IPPCI;
+		guest_cvmctl |= 6ull << CVMCTL_IPPCI_SHIFT;
+		write_gc0_cvmctl(guest_cvmctl);
+
+		cvmvmconfig = read_c0_cvmvmconfig();
+		/* No I/O hole translation. */
+		cvmvmconfig |= CVMVMCONF_DGHT;
+		/* Halve the root MMU size */
+		mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1)
+			    >> CVMVMCONF_MMUSIZEM1_S) + 1;
+		guest_mmu_size = mmu_size / 2;
+		mmu_size -= guest_mmu_size;
+		cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1;
+		cvmvmconfig |= mmu_size - 1;
+		write_c0_cvmvmconfig(cvmvmconfig);
+
+		/* Update our records */
+		current_cpu_data.tlbsize = mmu_size;
+		current_cpu_data.tlbsizevtlb = mmu_size;
+		current_cpu_data.guest.tlbsize = guest_mmu_size;
+
+		/* Flush moved entries in new (guest) context */
+		kvm_vz_local_flush_guesttlb_all();
+		break;
+	default:
+		/*
+		 * ImgTec cores tend to use a shared root/guest TLB. To avoid
+		 * overlap of root wired and guest entries, the guest TLB may
+		 * need resizing.
+		 */
+		mmu_size = current_cpu_data.tlbsizevtlb;
+		ftlb_size = current_cpu_data.tlbsize - mmu_size;
+
+		/* Try switching to maximum guest VTLB size for flush */
+		guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size);
+		current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size;
+		kvm_vz_local_flush_guesttlb_all();
+
+		/*
+		 * Reduce to make space for root wired entries and at least 2
+		 * root non-wired entries. This does assume that long-term wired
+		 * entries won't be added later.
+		 */
+		guest_mmu_size = mmu_size - num_wired_entries() - 2;
+		guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size);
+		current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size;
+
+		/*
+		 * Write the VTLB size, but if another CPU has already written,
+		 * check it matches or we won't provide a consistent view to the
+		 * guest. If this ever happens it suggests an asymmetric number
+		 * of wired entries.
+		 */
+		if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) &&
+		    WARN(guest_mmu_size != kvm_vz_guest_vtlb_size,
+			 "Available guest VTLB size mismatch"))
+			return -EINVAL;
+		break;
+	}
+
+	/*
+	 * Enable virtualization features granting guest direct control of
+	 * certain features:
+	 * CP0=1:	Guest coprocessor 0 context.
+	 * AT=Guest:	Guest MMU.
+	 * CG=1:	Hit (virtual address) CACHE operations (optional).
+	 * CF=1:	Guest Config registers.
+	 * CGI=1:	Indexed flush CACHE operations (optional).
+	 */
+	write_c0_guestctl0(MIPS_GCTL0_CP0 |
+			   (MIPS_GCTL0_AT_GUEST << MIPS_GCTL0_AT_SHIFT) |
+			   MIPS_GCTL0_CG | MIPS_GCTL0_CF);
+	if (cpu_has_guestctl0ext)
+		set_c0_guestctl0ext(MIPS_GCTL0EXT_CGI);
+
+	if (cpu_has_guestid) {
+		write_c0_guestctl1(0);
+		kvm_vz_local_flush_roottlb_all_guests();
+
+		GUESTID_MASK = current_cpu_data.guestid_mask;
+		GUESTID_FIRST_VERSION = GUESTID_MASK + 1;
+		GUESTID_VERSION_MASK = ~GUESTID_MASK;
+
+		current_cpu_data.guestid_cache = GUESTID_FIRST_VERSION;
+	}
+
+	/* clear any pending injected virtual guest interrupts */
+	if (cpu_has_guestctl2)
+		clear_c0_guestctl2(0x3f << 10);
+
+	return 0;
+}
+
+static void kvm_vz_hardware_disable(void)
+{
+	u64 cvmvmconfig;
+	unsigned int mmu_size;
+
+	/* Flush any remaining guest TLB entries */
+	kvm_vz_local_flush_guesttlb_all();
+
+	switch (current_cpu_type()) {
+	case CPU_CAVIUM_OCTEON3:
+		/*
+		 * Allocate whole TLB for root. Existing guest TLB entries will
+		 * change ownership to the root TLB. We should be safe though as
+		 * they've already been flushed above while in guest TLB.
+		 */
+		cvmvmconfig = read_c0_cvmvmconfig();
+		mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1)
+			    >> CVMVMCONF_MMUSIZEM1_S) + 1;
+		cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1;
+		cvmvmconfig |= mmu_size - 1;
+		write_c0_cvmvmconfig(cvmvmconfig);
+
+		/* Update our records */
+		current_cpu_data.tlbsize = mmu_size;
+		current_cpu_data.tlbsizevtlb = mmu_size;
+		current_cpu_data.guest.tlbsize = 0;
+
+		/* Flush moved entries in new (root) context */
+		local_flush_tlb_all();
+		break;
+	}
+
+	if (cpu_has_guestid) {
+		write_c0_guestctl1(0);
+		kvm_vz_local_flush_roottlb_all_guests();
+	}
+}
+
+static int kvm_vz_check_extension(struct kvm *kvm, long ext)
+{
+	int r;
+
+	switch (ext) {
+	case KVM_CAP_MIPS_VZ:
+		/* we wouldn't be here unless cpu_has_vz */
+		r = 1;
+		break;
+#ifdef CONFIG_64BIT
+	case KVM_CAP_MIPS_64BIT:
+		/* We support 64-bit registers/operations and addresses */
+		r = 2;
+		break;
+#endif
+	default:
+		r = 0;
+		break;
+	}
+
+	return r;
+}
+
+static int kvm_vz_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		vcpu->arch.vzguestid[i] = 0;
+
+	return 0;
+}
+
+static void kvm_vz_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+
+	/*
+	 * If the VCPU is freed and reused as another VCPU, we don't want the
+	 * matching pointer wrongly hanging around in last_vcpu[] or
+	 * last_exec_vcpu[].
+	 */
+	for_each_possible_cpu(cpu) {
+		if (last_vcpu[cpu] == vcpu)
+			last_vcpu[cpu] = NULL;
+		if (last_exec_vcpu[cpu] == vcpu)
+			last_exec_vcpu[cpu] = NULL;
+	}
+}
+
+static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct mips_coproc *cop0 = vcpu->arch.cop0;
+	unsigned long count_hz = 100*1000*1000; /* default to 100 MHz */
+
+	/*
+	 * Start off the timer at the same frequency as the host timer, but the
+	 * soft timer doesn't handle frequencies greater than 1GHz yet.
+	 */
+	if (mips_hpt_frequency && mips_hpt_frequency <= NSEC_PER_SEC)
+		count_hz = mips_hpt_frequency;
+	kvm_mips_init_count(vcpu, count_hz);
+
+	/*
+	 * Initialize guest register state to valid architectural reset state.
+	 */
+
+	/* PageGrain */
+	if (cpu_has_mips_r6)
+		kvm_write_sw_gc0_pagegrain(cop0, PG_RIE | PG_XIE | PG_IEC);
+	/* Wired */
+	if (cpu_has_mips_r6)
+		kvm_write_sw_gc0_wired(cop0,
+				       read_gc0_wired() & MIPSR6_WIRED_LIMIT);
+	/* Status */
+	kvm_write_sw_gc0_status(cop0, ST0_BEV | ST0_ERL);
+	if (cpu_has_mips_r6)
+		kvm_change_sw_gc0_status(cop0, ST0_FR, read_gc0_status());
+	/* IntCtl */
+	kvm_write_sw_gc0_intctl(cop0, read_gc0_intctl() &
+				(INTCTLF_IPFDC | INTCTLF_IPPCI | INTCTLF_IPTI));
+	/* PRId */
+	kvm_write_sw_gc0_prid(cop0, boot_cpu_data.processor_id);
+	/* EBase */
+	kvm_write_sw_gc0_ebase(cop0, (s32)0x80000000 | vcpu->vcpu_id);
+	/* Config */
+	kvm_save_gc0_config(cop0);
+	/* architecturally writable (e.g. from guest) */
+	kvm_change_sw_gc0_config(cop0, CONF_CM_CMASK,
+				 _page_cachable_default >> _CACHE_SHIFT);
+	/* architecturally read only, but maybe writable from root */
+	kvm_change_sw_gc0_config(cop0, MIPS_CONF_MT, read_c0_config());
+	if (cpu_guest_has_conf1) {
+		kvm_set_sw_gc0_config(cop0, MIPS_CONF_M);
+		/* Config1 */
+		kvm_save_gc0_config1(cop0);
+		/* architecturally read only, but maybe writable from root */
+		kvm_clear_sw_gc0_config1(cop0, MIPS_CONF1_C2	|
+					       MIPS_CONF1_MD	|
+					       MIPS_CONF1_PC	|
+					       MIPS_CONF1_WR	|
+					       MIPS_CONF1_CA	|
+					       MIPS_CONF1_FP);
+	}
+	if (cpu_guest_has_conf2) {
+		kvm_set_sw_gc0_config1(cop0, MIPS_CONF_M);
+		/* Config2 */
+		kvm_save_gc0_config2(cop0);
+	}
+	if (cpu_guest_has_conf3) {
+		kvm_set_sw_gc0_config2(cop0, MIPS_CONF_M);
+		/* Config3 */
+		kvm_save_gc0_config3(cop0);
+		/* architecturally writable (e.g. from guest) */
+		kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_ISA_OE);
+		/* architecturally read only, but maybe writable from root */
+		kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_MSA	|
+					       MIPS_CONF3_BPG	|
+					       MIPS_CONF3_ULRI	|
+					       MIPS_CONF3_DSP	|
+					       MIPS_CONF3_CTXTC	|
+					       MIPS_CONF3_ITL	|
+					       MIPS_CONF3_LPA	|
+					       MIPS_CONF3_VEIC	|
+					       MIPS_CONF3_VINT	|
+					       MIPS_CONF3_SP	|
+					       MIPS_CONF3_CDMM	|
+					       MIPS_CONF3_MT	|
+					       MIPS_CONF3_SM	|
+					       MIPS_CONF3_TL);
+	}
+	if (cpu_guest_has_conf4) {
+		kvm_set_sw_gc0_config3(cop0, MIPS_CONF_M);
+		/* Config4 */
+		kvm_save_gc0_config4(cop0);
+	}
+	if (cpu_guest_has_conf5) {
+		kvm_set_sw_gc0_config4(cop0, MIPS_CONF_M);
+		/* Config5 */
+		kvm_save_gc0_config5(cop0);
+		/* architecturally writable (e.g. from guest) */
+		kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_K	|
+					       MIPS_CONF5_CV	|
+					       MIPS_CONF5_MSAEN	|
+					       MIPS_CONF5_UFE	|
+					       MIPS_CONF5_FRE	|
+					       MIPS_CONF5_SBRI	|
+					       MIPS_CONF5_UFR);
+		/* architecturally read only, but maybe writable from root */
+		kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP);
+	}
+
+	if (cpu_guest_has_contextconfig) {
+		/* ContextConfig */
+		kvm_write_sw_gc0_contextconfig(cop0, 0x007ffff0);
+#ifdef CONFIG_64BIT
+		/* XContextConfig */
+		/* bits SEGBITS-13+3:4 set */
+		kvm_write_sw_gc0_xcontextconfig(cop0,
+					((1ull << (cpu_vmbits - 13)) - 1) << 4);
+#endif
+	}
+
+	/* Implementation dependent, use the legacy layout */
+	if (cpu_guest_has_segments) {
+		/* SegCtl0, SegCtl1, SegCtl2 */
+		kvm_write_sw_gc0_segctl0(cop0, 0x00200010);
+		kvm_write_sw_gc0_segctl1(cop0, 0x00000002 |
+				(_page_cachable_default >> _CACHE_SHIFT) <<
+						(16 + MIPS_SEGCFG_C_SHIFT));
+		kvm_write_sw_gc0_segctl2(cop0, 0x00380438);
+	}
+
+	/* reset HTW registers */
+	if (cpu_guest_has_htw && cpu_has_mips_r6) {
+		/* PWField */
+		kvm_write_sw_gc0_pwfield(cop0, 0x0c30c302);
+		/* PWSize */
+		kvm_write_sw_gc0_pwsize(cop0, 1 << MIPS_PWSIZE_PTW_SHIFT);
+	}
+
+	/* start with no pending virtual guest interrupts */
+	if (cpu_has_guestctl2)
+		cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0;
+
+	/* Put PC at reset vector */
+	vcpu->arch.pc = CKSEG1ADDR(0x1fc00000);
+
+	return 0;
+}
+
+static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+{
+	if (cpu_has_guestid) {
+		/* Flush GuestID for each VCPU individually */
+		kvm_flush_remote_tlbs(kvm);
+	} else {
+		/*
+		 * For each CPU there is a single GPA ASID used by all VCPUs in
+		 * the VM, so it doesn't make sense for the VCPUs to handle
+		 * invalidation of these ASIDs individually.
+		 *
+		 * Instead mark all CPUs as needing ASID invalidation in
+		 * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+		 * kick any running VCPUs so they check asid_flush_mask.
+		 */
+		cpumask_setall(&kvm->arch.asid_flush_mask);
+		kvm_flush_remote_tlbs(kvm);
+	}
+}
+
+static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
+					const struct kvm_memory_slot *slot)
+{
+	kvm_vz_flush_shadow_all(kvm);
+}
+
+static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+	int preserve_guest_tlb;
+
+	preserve_guest_tlb = kvm_vz_check_requests(vcpu, cpu);
+
+	if (preserve_guest_tlb)
+		kvm_vz_vcpu_save_wired(vcpu);
+
+	kvm_vz_vcpu_load_tlb(vcpu, cpu);
+
+	if (preserve_guest_tlb)
+		kvm_vz_vcpu_load_wired(vcpu);
+}
+
+static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int cpu = smp_processor_id();
+	int r;
+
+	kvm_vz_acquire_htimer(vcpu);
+	/* Check if we have any exceptions/interrupts pending */
+	kvm_mips_deliver_interrupts(vcpu, read_gc0_cause());
+
+	kvm_vz_check_requests(vcpu, cpu);
+	kvm_vz_vcpu_load_tlb(vcpu, cpu);
+	kvm_vz_vcpu_load_wired(vcpu);
+
+	r = vcpu->arch.vcpu_run(run, vcpu);
+
+	kvm_vz_vcpu_save_wired(vcpu);
+
+	return r;
+}
+
+static struct kvm_mips_callbacks kvm_vz_callbacks = {
+	.handle_cop_unusable = kvm_trap_vz_handle_cop_unusable,
+	.handle_tlb_mod = kvm_trap_vz_handle_tlb_st_miss,
+	.handle_tlb_ld_miss = kvm_trap_vz_handle_tlb_ld_miss,
+	.handle_tlb_st_miss = kvm_trap_vz_handle_tlb_st_miss,
+	.handle_addr_err_st = kvm_trap_vz_no_handler,
+	.handle_addr_err_ld = kvm_trap_vz_no_handler,
+	.handle_syscall = kvm_trap_vz_no_handler,
+	.handle_res_inst = kvm_trap_vz_no_handler,
+	.handle_break = kvm_trap_vz_no_handler,
+	.handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
+	.handle_guest_exit = kvm_trap_vz_handle_guest_exit,
+
+	.hardware_enable = kvm_vz_hardware_enable,
+	.hardware_disable = kvm_vz_hardware_disable,
+	.check_extension = kvm_vz_check_extension,
+	.vcpu_init = kvm_vz_vcpu_init,
+	.vcpu_uninit = kvm_vz_vcpu_uninit,
+	.vcpu_setup = kvm_vz_vcpu_setup,
+	.flush_shadow_all = kvm_vz_flush_shadow_all,
+	.flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+	.gva_to_gpa = kvm_vz_gva_to_gpa_cb,
+	.queue_timer_int = kvm_vz_queue_timer_int_cb,
+	.dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
+	.queue_io_int = kvm_vz_queue_io_int_cb,
+	.dequeue_io_int = kvm_vz_dequeue_io_int_cb,
+	.irq_deliver = kvm_vz_irq_deliver_cb,
+	.irq_clear = kvm_vz_irq_clear_cb,
+	.num_regs = kvm_vz_num_regs,
+	.copy_reg_indices = kvm_vz_copy_reg_indices,
+	.get_one_reg = kvm_vz_get_one_reg,
+	.set_one_reg = kvm_vz_set_one_reg,
+	.vcpu_load = kvm_vz_vcpu_load,
+	.vcpu_put = kvm_vz_vcpu_put,
+	.vcpu_run = kvm_vz_vcpu_run,
+	.vcpu_reenter = kvm_vz_vcpu_reenter,
+};
+
+int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
+{
+	if (!cpu_has_vz)
+		return -ENODEV;
+
+	/*
+	 * VZ requires at least 2 KScratch registers, so it should have been
+	 * possible to allocate pgd_reg.
+	 */
+	if (WARN(pgd_reg == -1,
+		 "pgd_reg not allocated even though cpu_has_vz\n"))
+		return -ENODEV;
+
+	pr_info("Starting KVM with MIPS VZ extensions\n");
+
+	*install_callbacks = &kvm_vz_callbacks;
+	return 0;
+}
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 6db3413..899e462 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -24,6 +24,7 @@
 /* Cache operations. */
 void (*flush_cache_all)(void);
 void (*__flush_cache_all)(void);
+EXPORT_SYMBOL_GPL(__flush_cache_all);
 void (*flush_cache_mm)(struct mm_struct *mm);
 void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end);
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index aa75849..3ca2028 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -348,7 +348,7 @@ void maar_init(void)
 		upper = ((upper & MIPS_MAAR_ADDR) << 4) | 0xffff;
 
 		pr_info("  [%d]: ", i / 2);
-		if (!(attr & MIPS_MAAR_V)) {
+		if (!(attr & MIPS_MAAR_VL)) {
 			pr_cont("disabled\n");
 			continue;
 		}
diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
index 4852e84..c0a5505 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -87,6 +87,11 @@ static inline unsigned int get_oc(u32 inst)
 	return (inst >> 11) & 0x7fff;
 }
 
+static inline unsigned int get_tx_or_sx(u32 inst)
+{
+	return (inst) & 0x1;
+}
+
 #define IS_XFORM(inst)	(get_op(inst)  == 31)
 #define IS_DSFORM(inst)	(get_op(inst) >= 56)
 
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c1d507..8a8ce22 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,6 +64,11 @@ struct iommu_table_ops {
 			long index,
 			unsigned long *hpa,
 			enum dma_data_direction *direction);
+	/* Real mode */
+	int (*exchange_rm)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction);
 #endif
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
@@ -114,6 +119,7 @@ struct iommu_table {
 	struct list_head it_group_list;/* List of iommu_table_group_link */
 	unsigned long *it_userspace; /* userspace view of the table */
 	struct iommu_table_ops *it_ops;
+	struct kref    it_kref;
 };
 
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
@@ -146,8 +152,8 @@ static inline void *get_iommu_table_base(struct device *dev)
 
 extern int dma_iommu_dma_supported(struct device *dev, u64 mask);
 
-/* Frees table for an individual device node */
-extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
+extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl);
+extern int iommu_tce_table_put(struct iommu_table *tbl);
 
 /* Initializes an iommu_table based in values set in the passed-in
  * structure
@@ -208,6 +214,8 @@ extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 		unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
@@ -288,11 +296,21 @@ static inline void iommu_restore(void)
 #endif
 
 /* The API to support IOMMU operations for VFIO */
-extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
-		unsigned long ioba, unsigned long tce_value,
-		unsigned long npages);
-extern int iommu_tce_put_param_check(struct iommu_table *tbl,
-		unsigned long ioba, unsigned long tce);
+extern int iommu_tce_check_ioba(unsigned long page_shift,
+		unsigned long offset, unsigned long size,
+		unsigned long ioba, unsigned long npages);
+extern int iommu_tce_check_gpa(unsigned long page_shift,
+		unsigned long gpa);
+
+#define iommu_tce_clear_param_check(tbl, ioba, tce_value, npages) \
+		(iommu_tce_check_ioba((tbl)->it_page_shift,       \
+				(tbl)->it_offset, (tbl)->it_size, \
+				(ioba), (npages)) || (tce_value))
+#define iommu_tce_put_param_check(tbl, ioba, gpa)                 \
+		(iommu_tce_check_ioba((tbl)->it_page_shift,       \
+				(tbl)->it_offset, (tbl)->it_size, \
+				(ioba), 1) ||                     \
+		iommu_tce_check_gpa((tbl)->it_page_shift, (gpa)))
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7bba8f4..77c6082 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -45,9 +45,6 @@
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
-#ifdef CONFIG_KVM_MMIO
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
-#endif
 #define KVM_HALT_POLL_NS_DEFAULT 10000	/* 10 us */
 
 /* These values are internal and can be increased later */
@@ -191,6 +188,13 @@ struct kvmppc_pginfo {
 	atomic_t refcnt;
 };
 
+struct kvmppc_spapr_tce_iommu_table {
+	struct rcu_head rcu;
+	struct list_head next;
+	struct iommu_table *tbl;
+	struct kref kref;
+};
+
 struct kvmppc_spapr_tce_table {
 	struct list_head list;
 	struct kvm *kvm;
@@ -199,6 +203,7 @@ struct kvmppc_spapr_tce_table {
 	u32 page_shift;
 	u64 offset;		/* in pages */
 	u64 size;		/* window size in pages */
+	struct list_head iommu_tables;
 	struct page *pages[0];
 };
 
@@ -345,6 +350,7 @@ struct kvmppc_pte {
 	bool may_read		: 1;
 	bool may_write		: 1;
 	bool may_execute	: 1;
+	unsigned long wimg;
 	u8 page_size;		/* MMU_PAGE_xxx */
 };
 
@@ -441,6 +447,11 @@ struct mmio_hpte_cache {
 	unsigned int index;
 };
 
+#define KVMPPC_VSX_COPY_NONE		0
+#define KVMPPC_VSX_COPY_WORD		1
+#define KVMPPC_VSX_COPY_DWORD		2
+#define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP	3
+
 struct openpic;
 
 struct kvm_vcpu_arch {
@@ -644,6 +655,21 @@ struct kvm_vcpu_arch {
 	u8 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_host_swabbed;
 	u8 mmio_sign_extend;
+	/* conversion between single and double precision */
+	u8 mmio_sp64_extend;
+	/*
+	 * Number of simulations for vsx.
+	 * If we use 2*8bytes to simulate 1*16bytes,
+	 * then the number should be 2 and
+	 * mmio_vsx_copy_type=KVMPPC_VSX_COPY_DWORD.
+	 * If we use 4*4bytes to simulate 1*16bytes,
+	 * the number should be 4 and
+	 * mmio_vsx_copy_type=KVMPPC_VSX_COPY_WORD.
+	 */
+	u8 mmio_vsx_copy_nums;
+	u8 mmio_vsx_offset;
+	u8 mmio_vsx_copy_type;
+	u8 mmio_vsx_tx_sx_enabled;
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
@@ -732,6 +758,8 @@ struct kvm_vcpu_arch {
 };
 
 #define VCPU_FPR(vcpu, i)	(vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
+#define VCPU_VSX_FPR(vcpu, i, j)	((vcpu)->arch.fp.fpr[i][j])
+#define VCPU_VSX_VR(vcpu, i)		((vcpu)->arch.vr.vr[i])
 
 /* Values for vcpu->arch.state */
 #define KVMPPC_VCPU_NOTREADY		0
@@ -745,6 +773,7 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FPR	0x0020
 #define KVM_MMIO_REG_QPR	0x0040
 #define KVM_MMIO_REG_FQPR	0x0060
+#define KVM_MMIO_REG_VSX	0x0080
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index dd11c4c..5885d32 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -78,9 +78,15 @@ extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                unsigned int rt, unsigned int bytes,
 			       int is_default_endian);
+extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				unsigned int rt, unsigned int bytes,
+			int is_default_endian, int mmio_sign_extend);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			       u64 val, unsigned int bytes,
 			       int is_default_endian);
+extern int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				int rs, unsigned int bytes,
+				int is_default_endian);
 
 extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
 				 enum instruction_type type, u32 *inst);
@@ -132,6 +138,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
+extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
@@ -164,13 +173,19 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 			struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+		struct iommu_group *grp);
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+		struct iommu_group *grp);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce_64 *args);
 extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
-		struct kvm_vcpu *vcpu, unsigned long liobn);
-extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
-		unsigned long ioba, unsigned long npages);
+		struct kvm *kvm, unsigned long liobn);
+#define kvmppc_ioba_validate(stt, ioba, npages)                         \
+		(iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
+				(stt)->size, (ioba), (npages)) ?        \
+				H_PARAMETER : H_SUCCESS)
 extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
 		unsigned long tce);
 extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
@@ -240,6 +255,7 @@ union kvmppc_one_reg {
 	u64	dval;
 	vector128 vval;
 	u64	vsxval[2];
+	u32	vsx32val[4];
 	struct {
 		u64	addr;
 		u64	length;
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b9e3f0a..c70c827 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm);
 extern void mm_iommu_cleanup(struct mm_struct *mm);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 		unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
+		struct mm_struct *mm, unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index e7d6d86..73f06f4 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -86,32 +86,79 @@
 #define OP_TRAP_64 2
 
 #define OP_31_XOP_TRAP      4
+#define OP_31_XOP_LDX       21
 #define OP_31_XOP_LWZX      23
+#define OP_31_XOP_LDUX      53
 #define OP_31_XOP_DCBST     54
 #define OP_31_XOP_LWZUX     55
 #define OP_31_XOP_TRAP_64   68
 #define OP_31_XOP_DCBF      86
 #define OP_31_XOP_LBZX      87
+#define OP_31_XOP_STDX      149
 #define OP_31_XOP_STWX      151
+#define OP_31_XOP_STDUX     181
+#define OP_31_XOP_STWUX     183
 #define OP_31_XOP_STBX      215
 #define OP_31_XOP_LBZUX     119
 #define OP_31_XOP_STBUX     247
 #define OP_31_XOP_LHZX      279
 #define OP_31_XOP_LHZUX     311
 #define OP_31_XOP_MFSPR     339
+#define OP_31_XOP_LWAX      341
 #define OP_31_XOP_LHAX      343
+#define OP_31_XOP_LWAUX     373
 #define OP_31_XOP_LHAUX     375
 #define OP_31_XOP_STHX      407
 #define OP_31_XOP_STHUX     439
 #define OP_31_XOP_MTSPR     467
 #define OP_31_XOP_DCBI      470
+#define OP_31_XOP_LDBRX     532
 #define OP_31_XOP_LWBRX     534
 #define OP_31_XOP_TLBSYNC   566
+#define OP_31_XOP_STDBRX    660
 #define OP_31_XOP_STWBRX    662
+#define OP_31_XOP_STFSX	    663
+#define OP_31_XOP_STFSUX    695
+#define OP_31_XOP_STFDX     727
+#define OP_31_XOP_STFDUX    759
 #define OP_31_XOP_LHBRX     790
+#define OP_31_XOP_LFIWAX    855
+#define OP_31_XOP_LFIWZX    887
 #define OP_31_XOP_STHBRX    918
+#define OP_31_XOP_STFIWX    983
+
+/* VSX Scalar Load Instructions */
+#define OP_31_XOP_LXSDX         588
+#define OP_31_XOP_LXSSPX        524
+#define OP_31_XOP_LXSIWAX       76
+#define OP_31_XOP_LXSIWZX       12
+
+/* VSX Scalar Store Instructions */
+#define OP_31_XOP_STXSDX        716
+#define OP_31_XOP_STXSSPX       652
+#define OP_31_XOP_STXSIWX       140
+
+/* VSX Vector Load Instructions */
+#define OP_31_XOP_LXVD2X        844
+#define OP_31_XOP_LXVW4X        780
+
+/* VSX Vector Load and Splat Instruction */
+#define OP_31_XOP_LXVDSX        332
+
+/* VSX Vector Store Instructions */
+#define OP_31_XOP_STXVD2X       972
+#define OP_31_XOP_STXVW4X       908
+
+#define OP_31_XOP_LFSX          535
+#define OP_31_XOP_LFSUX         567
+#define OP_31_XOP_LFDX          599
+#define OP_31_XOP_LFDUX		631
 
 #define OP_LWZ  32
+#define OP_STFS 52
+#define OP_STFSU 53
+#define OP_STFD 54
+#define OP_STFDU 55
 #define OP_LD   58
 #define OP_LWZU 33
 #define OP_LBZ  34
@@ -127,6 +174,17 @@
 #define OP_LHAU 43
 #define OP_STH  44
 #define OP_STHU 45
+#define OP_LMW  46
+#define OP_STMW 47
+#define OP_LFS  48
+#define OP_LFSU 49
+#define OP_LFD  50
+#define OP_LFDU 51
+#define OP_STFS 52
+#define OP_STFSU 53
+#define OP_STFD  54
+#define OP_STFDU 55
+#define OP_LQ    56
 
 /* sorted alphabetically */
 #define PPC_INST_BHRBE			0x7c00025c
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 4edbe4b..07fbeb9 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -29,6 +29,9 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_GUEST_DEBUG
 
+/* Not always available, but if it is, this is the correct offset.  */
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 struct kvm_regs {
 	__u64 pc;
 	__u64 cr;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5f202a5..f2b724c 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -711,13 +711,16 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	return tbl;
 }
 
-void iommu_free_table(struct iommu_table *tbl, const char *node_name)
+static void iommu_table_free(struct kref *kref)
 {
 	unsigned long bitmap_sz;
 	unsigned int order;
+	struct iommu_table *tbl;
 
-	if (!tbl)
-		return;
+	tbl = container_of(kref, struct iommu_table, it_kref);
+
+	if (tbl->it_ops->free)
+		tbl->it_ops->free(tbl);
 
 	if (!tbl->it_map) {
 		kfree(tbl);
@@ -733,7 +736,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
-		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
+		pr_warn("%s: Unexpected TCEs\n", __func__);
 
 	/* calculate bitmap size in bytes */
 	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
@@ -746,6 +749,24 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	kfree(tbl);
 }
 
+struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
+{
+	if (kref_get_unless_zero(&tbl->it_kref))
+		return tbl;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_get);
+
+int iommu_tce_table_put(struct iommu_table *tbl)
+{
+	if (WARN_ON(!tbl))
+		return 0;
+
+	return kref_put(&tbl->it_kref, iommu_table_free);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_put);
+
 /* Creates TCEs for a user provided buffer.  The user buffer must be
  * contiguous real kernel storage (not vmalloc).  The address passed here
  * comprises a page address and offset into that page. The dma_addr_t
@@ -942,47 +963,36 @@ void iommu_flush_tce(struct iommu_table *tbl)
 }
 EXPORT_SYMBOL_GPL(iommu_flush_tce);
 
-int iommu_tce_clear_param_check(struct iommu_table *tbl,
-		unsigned long ioba, unsigned long tce_value,
-		unsigned long npages)
+int iommu_tce_check_ioba(unsigned long page_shift,
+		unsigned long offset, unsigned long size,
+		unsigned long ioba, unsigned long npages)
 {
-	/* tbl->it_ops->clear() does not support any value but 0 */
-	if (tce_value)
-		return -EINVAL;
+	unsigned long mask = (1UL << page_shift) - 1;
 
-	if (ioba & ~IOMMU_PAGE_MASK(tbl))
+	if (ioba & mask)
 		return -EINVAL;
 
-	ioba >>= tbl->it_page_shift;
-	if (ioba < tbl->it_offset)
+	ioba >>= page_shift;
+	if (ioba < offset)
 		return -EINVAL;
 
-	if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
+	if ((ioba + 1) > (offset + size))
 		return -EINVAL;
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
+EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
 
-int iommu_tce_put_param_check(struct iommu_table *tbl,
-		unsigned long ioba, unsigned long tce)
+int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
 {
-	if (tce & ~IOMMU_PAGE_MASK(tbl))
-		return -EINVAL;
-
-	if (ioba & ~IOMMU_PAGE_MASK(tbl))
-		return -EINVAL;
+	unsigned long mask = (1UL << page_shift) - 1;
 
-	ioba >>= tbl->it_page_shift;
-	if (ioba < tbl->it_offset)
-		return -EINVAL;
-
-	if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
+	if (gpa & mask)
 		return -EINVAL;
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
+EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
 
 long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 		unsigned long *hpa, enum dma_data_direction *direction)
@@ -1004,6 +1014,31 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret;
+
+	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL))) {
+		struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
+
+		if (likely(pg)) {
+			SetPageDirty(pg);
+		} else {
+			tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+			ret = -EFAULT;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
+#endif
+
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 029be26..65a471d 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -67,6 +67,7 @@ config KVM_BOOK3S_64
 	select KVM_BOOK3S_64_HANDLER
 	select KVM
 	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
+	select SPAPR_TCE_IOMMU if IOMMU_SUPPORT
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b6b5c18..0ff0d07 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -197,6 +197,24 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 }
 EXPORT_SYMBOL_GPL(kvmppc_core_queue_program);
 
+void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, 0);
+}
+
+void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_ALTIVEC, 0);
+}
+
+void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_VSX, 0);
+}
+
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 {
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 7015357..29ebe2f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -319,6 +319,7 @@ do_second:
 		gpte->may_execute = true;
 	gpte->may_read = false;
 	gpte->may_write = false;
+	gpte->wimg = r & HPTE_R_WIMG;
 
 	switch (pp) {
 	case 0:
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index a587e8f..145a618 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -145,6 +145,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
 	else
 		kvmppc_mmu_flush_icache(pfn);
 
+	rflags = (rflags & ~HPTE_R_WIMG) | orig_pte->wimg;
+
 	/*
 	 * Use 64K pages if possible; otherwise, on 64K page kernels,
 	 * we need to transfer 4 more bits from guest real to host real addr.
@@ -177,12 +179,15 @@ map_again:
 	ret = mmu_hash_ops.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags,
 				       hpsize, hpsize, MMU_SEGSIZE_256M);
 
-	if (ret < 0) {
+	if (ret == -1) {
 		/* If we couldn't map a primary PTE, try a secondary */
 		hash = ~hash;
 		vflags ^= HPTE_V_SECONDARY;
 		attempt++;
 		goto map_again;
+	} else if (ret < 0) {
+		r = -EIO;
+		goto out_unlock;
 	} else {
 		trace_kvm_book3s_64_mmu_map(rflags, hpteg,
 					    vpn, hpaddr, orig_pte);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 3e26cd4..a160c14 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -28,6 +28,8 @@
 #include <linux/hugetlb.h>
 #include <linux/list.h>
 #include <linux/anon_inodes.h>
+#include <linux/iommu.h>
+#include <linux/file.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -40,6 +42,7 @@
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/mmu_context.h>
 
 static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
 {
@@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
 	return ret;
 }
 
+static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
+{
+	struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
+			struct kvmppc_spapr_tce_iommu_table, rcu);
+
+	iommu_tce_table_put(stit->tbl);
+
+	kfree(stit);
+}
+
+static void kvm_spapr_tce_liobn_put(struct kref *kref)
+{
+	struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref,
+			struct kvmppc_spapr_tce_iommu_table, kref);
+
+	list_del_rcu(&stit->next);
+
+	call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free);
+}
+
+extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
+		struct iommu_group *grp)
+{
+	int i;
+	struct kvmppc_spapr_tce_table *stt;
+	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
+	struct iommu_table_group *table_group = NULL;
+
+	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+
+		table_group = iommu_group_get_iommudata(grp);
+		if (WARN_ON(!table_group))
+			continue;
+
+		list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+			for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+				if (table_group->tables[i] != stit->tbl)
+					continue;
+
+				kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
+				return;
+			}
+		}
+	}
+}
+
+extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
+		struct iommu_group *grp)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	bool found = false;
+	struct iommu_table *tbl = NULL;
+	struct iommu_table_group *table_group;
+	long i;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	struct fd f;
+
+	f = fdget(tablefd);
+	if (!f.file)
+		return -EBADF;
+
+	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt == f.file->private_data) {
+			found = true;
+			break;
+		}
+	}
+
+	fdput(f);
+
+	if (!found)
+		return -EINVAL;
+
+	table_group = iommu_group_get_iommudata(grp);
+	if (WARN_ON(!table_group))
+		return -EFAULT;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbltmp = table_group->tables[i];
+
+		if (!tbltmp)
+			continue;
+		/*
+		 * Make sure hardware table parameters are exactly the same;
+		 * this is used in the TCE handlers where boundary checks
+		 * use only the first attached table.
+		 */
+		if ((tbltmp->it_page_shift == stt->page_shift) &&
+				(tbltmp->it_offset == stt->offset) &&
+				(tbltmp->it_size == stt->size)) {
+			/*
+			 * Reference the table to avoid races with
+			 * add/remove DMA windows.
+			 */
+			tbl = iommu_tce_table_get(tbltmp);
+			break;
+		}
+	}
+	if (!tbl)
+		return -EINVAL;
+
+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+		if (tbl != stit->tbl)
+			continue;
+
+		if (!kref_get_unless_zero(&stit->kref)) {
+			/* stit is being destroyed */
+			iommu_tce_table_put(tbl);
+			return -ENOTTY;
+		}
+		/*
+		 * The table is already known to this KVM, we just increased
+		 * its KVM reference counter and can return.
+		 */
+		return 0;
+	}
+
+	stit = kzalloc(sizeof(*stit), GFP_KERNEL);
+	if (!stit) {
+		iommu_tce_table_put(tbl);
+		return -ENOMEM;
+	}
+
+	stit->tbl = tbl;
+	kref_init(&stit->kref);
+
+	list_add_rcu(&stit->next, &stt->iommu_tables);
+
+	return 0;
+}
+
 static void release_spapr_tce_table(struct rcu_head *head)
 {
 	struct kvmppc_spapr_tce_table *stt = container_of(head,
@@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
 static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
 
 	list_del_rcu(&stt->list);
 
+	list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
+		WARN_ON(!kref_read(&stit->kref));
+		while (1) {
+			if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put))
+				break;
+		}
+	}
+
 	kvm_put_kvm(stt->kvm);
 
 	kvmppc_account_memlimit(
@@ -164,7 +307,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 			return -EBUSY;
 	}
 
-	size = args->size;
+	size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
 	npages = kvmppc_tce_pages(size);
 	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
 	if (ret) {
@@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	stt->offset = args->offset;
 	stt->size = size;
 	stt->kvm = kvm;
+	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
 
 	for (i = 0; i < npages; i++) {
 		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
@@ -211,15 +355,106 @@ fail:
 	return ret;
 }
 
+static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long hpa = 0;
+	enum dma_data_direction dir = DMA_NONE;
+
+	iommu_tce_xchg(tbl, entry, &hpa, &dir);
+}
+
+static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup(kvm->mm, *pua, pgsize);
+	if (!mem)
+		return H_TOO_HARD;
+
+	mm_iommu_mapped_dec(mem);
+
+	*pua = 0;
+
+	return H_SUCCESS;
+}
+
+static long kvmppc_tce_iommu_unmap(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	enum dma_data_direction dir = DMA_NONE;
+	unsigned long hpa = 0;
+	long ret;
+
+	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
+		return H_HARDWARE;
+
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+	if (ret != H_SUCCESS)
+		iommu_tce_xchg(tbl, entry, &hpa, &dir);
+
+	return ret;
+}
+
+long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long ua,
+		enum dma_data_direction dir)
+{
+	long ret;
+	unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+	if (!mem)
+		/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
+		return H_TOO_HARD;
+
+	if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
+		return H_HARDWARE;
+
+	if (mm_iommu_mapped_inc(mem))
+		return H_CLOSED;
+
+	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
+	if (WARN_ON_ONCE(ret)) {
+		mm_iommu_mapped_dec(mem);
+		return H_HARDWARE;
+	}
+
+	if (dir != DMA_NONE)
+		kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+	*pua = ua;
+
+	return 0;
+}
+
 long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba, unsigned long tce)
 {
-	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
-	long ret;
+	struct kvmppc_spapr_tce_table *stt;
+	long ret, idx;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long entry, ua = 0;
+	enum dma_data_direction dir;
 
 	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
 	/* 	    liobn, ioba, tce); */
 
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
@@ -231,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 	if (ret != H_SUCCESS)
 		return ret;
 
-	kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+	dir = iommu_tce_direction(tce);
+	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
+			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+		return H_PARAMETER;
+
+	entry = ioba >> stt->page_shift;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		if (dir == DMA_NONE) {
+			ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry);
+		} else {
+			idx = srcu_read_lock(&vcpu->kvm->srcu);
+			ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl,
+					entry, ua, dir);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		}
+
+		if (ret == H_SUCCESS)
+			continue;
+
+		if (ret == H_TOO_HARD)
+			return ret;
+
+		WARN_ON_ONCE(1);
+		kvmppc_clear_tce(stit->tbl, entry);
+	}
+
+	kvmppc_tce_put(stt, entry, tce);
 
 	return H_SUCCESS;
 }
@@ -246,8 +509,9 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	unsigned long entry, ua = 0;
 	u64 __user *tces;
 	u64 tce;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
-	stt = kvmppc_find_table(vcpu, liobn);
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
@@ -284,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
 
+		if (kvmppc_gpa_to_ua(vcpu->kvm,
+				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
+				&ua, NULL))
+			return H_PARAMETER;
+
+		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+			ret = kvmppc_tce_iommu_map(vcpu->kvm,
+					stit->tbl, entry + i, ua,
+					iommu_tce_direction(tce));
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				goto unlock_exit;
+
+			WARN_ON_ONCE(1);
+			kvmppc_clear_tce(stit->tbl, entry);
+		}
+
 		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
@@ -300,8 +584,9 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_spapr_tce_table *stt;
 	long i, ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
-	stt = kvmppc_find_table(vcpu, liobn);
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
@@ -313,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
 		return H_PARAMETER;
 
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		unsigned long entry = ioba >> stit->tbl->it_page_shift;
+
+		for (i = 0; i < npages; ++i) {
+			ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry + i);
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				return ret;
+
+			WARN_ON_ONCE(1);
+			kvmppc_clear_tce(stit->tbl, entry);
+		}
+	}
+
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index e4c4ea9..eda0a8f 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -40,6 +40,31 @@
 #include <asm/iommu.h>
 #include <asm/tce.h>
 
+#ifdef CONFIG_BUG
+
+#define WARN_ON_ONCE_RM(condition)	({			\
+	static bool __section(.data.unlikely) __warned;		\
+	int __ret_warn_once = !!(condition);			\
+								\
+	if (unlikely(__ret_warn_once && !__warned)) {		\
+		__warned = true;				\
+		pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n",	\
+				__stringify(condition),		\
+				__func__, __LINE__);		\
+		dump_stack();					\
+	}							\
+	unlikely(__ret_warn_once);				\
+})
+
+#else
+
+#define WARN_ON_ONCE_RM(condition) ({				\
+	int __ret_warn_on = !!(condition);			\
+	unlikely(__ret_warn_on);				\
+})
+
+#endif
+
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
 
 /*
@@ -48,10 +73,9 @@
  * WARNING: This will be called in real or virtual mode on HV KVM and virtual
  *          mode on PR KVM
  */
-struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
 		unsigned long liobn)
 {
-	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_spapr_tce_table *stt;
 
 	list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
@@ -63,27 +87,6 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
 EXPORT_SYMBOL_GPL(kvmppc_find_table);
 
 /*
- * Validates IO address.
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- *          mode on PR KVM
- */
-long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
-		unsigned long ioba, unsigned long npages)
-{
-	unsigned long mask = (1ULL << stt->page_shift) - 1;
-	unsigned long idx = ioba >> stt->page_shift;
-
-	if ((ioba & mask) || (idx < stt->offset) ||
-			(idx - stt->offset + npages > stt->size) ||
-			(idx + npages < idx))
-		return H_PARAMETER;
-
-	return H_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
-
-/*
  * Validates TCE address.
  * At the moment flags and page mask are validated.
  * As the host kernel does not access those addresses (just puts them
@@ -96,10 +99,14 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
  */
 long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
 {
-	unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
-	unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	enum dma_data_direction dir = iommu_tce_direction(tce);
+
+	/* Allow userspace to poison TCE table */
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
 
-	if (tce & mask)
+	if (iommu_tce_check_gpa(stt->page_shift, gpa))
 		return H_PARAMETER;
 
 	return H_SUCCESS;
@@ -179,15 +186,122 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long hpa = 0;
+	enum dma_data_direction dir = DMA_NONE;
+
+	iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+}
+
+static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	pua = (void *) vmalloc_to_phys(pua);
+	if (WARN_ON_ONCE_RM(!pua))
+		return H_HARDWARE;
+
+	mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize);
+	if (!mem)
+		return H_TOO_HARD;
+
+	mm_iommu_mapped_dec(mem);
+
+	*pua = 0;
+
+	return H_SUCCESS;
+}
+
+static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	enum dma_data_direction dir = DMA_NONE;
+	unsigned long hpa = 0;
+	long ret;
+
+	if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir))
+		/*
+		 * real mode xchg can fail if struct page crosses
+		 * a page boundary
+		 */
+		return H_TOO_HARD;
+
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
+	if (ret)
+		iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+
+	return ret;
+}
+
+static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long ua,
+		enum dma_data_direction dir)
+{
+	long ret;
+	unsigned long hpa = 0;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if (!pua)
+		/* it_userspace allocation might be delayed */
+		return H_TOO_HARD;
+
+	mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift);
+	if (!mem)
+		return H_TOO_HARD;
+
+	if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
+		return H_HARDWARE;
+
+	pua = (void *) vmalloc_to_phys(pua);
+	if (WARN_ON_ONCE_RM(!pua))
+		return H_HARDWARE;
+
+	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
+		return H_CLOSED;
+
+	ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+	if (ret) {
+		mm_iommu_mapped_dec(mem);
+		/*
+		 * real mode xchg can fail if struct page crosses
+		 * a page boundary
+		 */
+		return H_TOO_HARD;
+	}
+
+	if (dir != DMA_NONE)
+		kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
+
+	*pua = ua;
+
+	return 0;
+}
+
 long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		unsigned long ioba, unsigned long tce)
 {
-	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+	struct kvmppc_spapr_tce_table *stt;
 	long ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long entry, ua = 0;
+	enum dma_data_direction dir;
 
 	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
 	/* 	    liobn, ioba, tce); */
 
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
@@ -199,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 	if (ret != H_SUCCESS)
 		return ret;
 
-	kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+	dir = iommu_tce_direction(tce);
+	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
+			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+		return H_PARAMETER;
+
+	entry = ioba >> stt->page_shift;
+
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		if (dir == DMA_NONE)
+			ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry);
+		else
+			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm,
+					stit->tbl, entry, ua, dir);
+
+		if (ret == H_SUCCESS)
+			continue;
+
+		if (ret == H_TOO_HARD)
+			return ret;
+
+		WARN_ON_ONCE_RM(1);
+		kvmppc_rm_clear_tce(stit->tbl, entry);
+	}
+
+	kvmppc_tce_put(stt, entry, tce);
 
 	return H_SUCCESS;
 }
@@ -239,8 +378,10 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	long i, ret = H_SUCCESS;
 	unsigned long tces, entry, ua = 0;
 	unsigned long *rmap = NULL;
+	bool prereg = false;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
-	stt = kvmppc_find_table(vcpu, liobn);
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
@@ -259,23 +400,49 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	if (ret != H_SUCCESS)
 		return ret;
 
-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
-		return H_TOO_HARD;
+	if (mm_iommu_preregistered(vcpu->kvm->mm)) {
+		/*
+		 * We get here if guest memory was pre-registered which
+		 * is normally VFIO case and gpa->hpa translation does not
+		 * depend on hpt.
+		 */
+		struct mm_iommu_table_group_mem_t *mem;
 
-	rmap = (void *) vmalloc_to_phys(rmap);
+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+			return H_TOO_HARD;
 
-	/*
-	 * Synchronize with the MMU notifier callbacks in
-	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
-	 * While we have the rmap lock, code running on other CPUs
-	 * cannot finish unmapping the host real page that backs
-	 * this guest real page, so we are OK to access the host
-	 * real page.
-	 */
-	lock_rmap(rmap);
-	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
-		ret = H_TOO_HARD;
-		goto unlock_exit;
+		mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
+		if (mem)
+			prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &tces) == 0;
+	}
+
+	if (!prereg) {
+		/*
+		 * This is usually a case of a guest with emulated devices only
+		 * when TCE list is not in preregistered memory.
+		 * We do not require memory to be preregistered in this case
+		 * so lock rmap and do __find_linux_pte_or_hugepte().
+		 */
+		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+			return H_TOO_HARD;
+
+		rmap = (void *) vmalloc_to_phys(rmap);
+		if (WARN_ON_ONCE_RM(!rmap))
+			return H_HARDWARE;
+
+		/*
+		 * Synchronize with the MMU notifier callbacks in
+		 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+		 * While we have the rmap lock, code running on other CPUs
+		 * cannot finish unmapping the host real page that backs
+		 * this guest real page, so we are OK to access the host
+		 * real page.
+		 */
+		lock_rmap(rmap);
+		if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
 	}
 
 	for (i = 0; i < npages; ++i) {
@@ -285,11 +452,33 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
 
+		ua = 0;
+		if (kvmppc_gpa_to_ua(vcpu->kvm,
+				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
+				&ua, NULL))
+			return H_PARAMETER;
+
+		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm,
+					stit->tbl, entry + i, ua,
+					iommu_tce_direction(tce));
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				goto unlock_exit;
+
+			WARN_ON_ONCE_RM(1);
+			kvmppc_rm_clear_tce(stit->tbl, entry);
+		}
+
 		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
 unlock_exit:
-	unlock_rmap(rmap);
+	if (rmap)
+		unlock_rmap(rmap);
 
 	return ret;
 }
@@ -300,8 +489,9 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 {
 	struct kvmppc_spapr_tce_table *stt;
 	long i, ret;
+	struct kvmppc_spapr_tce_iommu_table *stit;
 
-	stt = kvmppc_find_table(vcpu, liobn);
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
@@ -313,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
 		return H_PARAMETER;
 
+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+		unsigned long entry = ioba >> stit->tbl->it_page_shift;
+
+		for (i = 0; i < npages; ++i) {
+			ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm,
+					stit->tbl, entry + i);
+
+			if (ret == H_SUCCESS)
+				continue;
+
+			if (ret == H_TOO_HARD)
+				return ret;
+
+			WARN_ON_ONCE_RM(1);
+			kvmppc_rm_clear_tce(stit->tbl, entry);
+		}
+	}
+
 	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
 		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
 
@@ -322,12 +530,13 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba)
 {
-	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+	struct kvmppc_spapr_tce_table *stt;
 	long ret;
 	unsigned long idx;
 	struct page *page;
 	u64 *tbl;
 
+	stt = kvmppc_find_table(vcpu->kvm, liobn);
 	if (!stt)
 		return H_TOO_HARD;
 
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 8359752..68d6898 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -503,10 +503,18 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 		break;
 unprivileged:
 	default:
-		printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
-#ifndef DEBUG_SPR
-		emulated = EMULATE_FAIL;
-#endif
+		pr_info_ratelimited("KVM: invalid SPR write: %d\n", sprn);
+		if (sprn & 0x10) {
+			if (kvmppc_get_msr(vcpu) & MSR_PR) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+				emulated = EMULATE_AGAIN;
+			}
+		} else {
+			if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+				emulated = EMULATE_AGAIN;
+			}
+		}
 		break;
 	}
 
@@ -648,10 +656,20 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
 		break;
 	default:
 unprivileged:
-		printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
-#ifndef DEBUG_SPR
-		emulated = EMULATE_FAIL;
-#endif
+		pr_info_ratelimited("KVM: invalid SPR read: %d\n", sprn);
+		if (sprn & 0x10) {
+			if (kvmppc_get_msr(vcpu) & MSR_PR) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+				emulated = EMULATE_AGAIN;
+			}
+		} else {
+			if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0 ||
+			    sprn == 4 || sprn == 5 || sprn == 6) {
+				kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+				emulated = EMULATE_AGAIN;
+			}
+		}
+
 		break;
 	}
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ec86d9..06b7d8a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3624,11 +3624,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 		return -EIO;
 
 	mutex_lock(&kvm->lock);
+	if (!kvm->arch.pimap)
+		goto unlock;
 
-	if (kvm->arch.pimap == NULL) {
-		mutex_unlock(&kvm->lock);
-		return 0;
-	}
 	pimap = kvm->arch.pimap;
 
 	for (i = 0; i < pimap->n_mapped; i++) {
@@ -3650,7 +3648,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
 	 * We don't free this structure even when the count goes to
 	 * zero. The structure is freed when we destroy the VM.
 	 */
-
+ unlock:
 	mutex_unlock(&kvm->lock);
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d4dfc0c..69a0944 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -349,7 +349,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
 	if (msr & MSR_POW) {
 		if (!vcpu->arch.pending_exceptions) {
 			kvm_vcpu_block(vcpu);
-			clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 			vcpu->stat.halt_wakeup++;
 
 			/* Unset POW bit after we woke up */
@@ -537,8 +537,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int r = RESUME_GUEST;
 	int relocated;
 	int page_found = 0;
-	struct kvmppc_pte pte;
-	bool is_mmio = false;
+	struct kvmppc_pte pte = { 0 };
 	bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false;
 	bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false;
 	u64 vsid;
@@ -616,8 +615,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* Page not found in guest SLB */
 		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
 		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
-	} else if (!is_mmio &&
-		   kvmppc_visible_gpa(vcpu, pte.raddr)) {
+	} else if (kvmppc_visible_gpa(vcpu, pte.raddr)) {
 		if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) {
 			/*
 			 * There is already a host HPTE there, presumably
@@ -627,7 +625,11 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_mmu_unmap_page(vcpu, &pte);
 		}
 		/* The guest's PTE is not mapped yet. Map on the host */
-		kvmppc_mmu_map_page(vcpu, &pte, iswrite);
+		if (kvmppc_mmu_map_page(vcpu, &pte, iswrite) == -EIO) {
+			/* Exit KVM if mapping failed */
+			run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			return RESUME_HOST;
+		}
 		if (data)
 			vcpu->stat.sp_storage++;
 		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index f102616..bcbeeb6 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -344,7 +344,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 	case H_CEDE:
 		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
 		kvm_vcpu_block(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
 	case H_LOGICAL_CI_LOAD:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0514cbd..3eaac38 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -300,6 +300,11 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
 	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
 }
 
+void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
+}
+
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 {
 	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
@@ -579,7 +584,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu)
 	 * userspace, so clear the KVM_REQ_WATCHDOG request.
 	 */
 	if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
-		clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_WATCHDOG, vcpu);
 
 	spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
 	nr_jiffies = watchdog_next_timeout(vcpu);
@@ -690,7 +695,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shared->msr & MSR_WE) {
 		local_irq_enable();
 		kvm_vcpu_block(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		hard_irq_disable();
 
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 0fda423..77fd043 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -797,9 +797,8 @@ int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	host_tlb_params[0].sets =
 		host_tlb_params[0].entries / host_tlb_params[0].ways;
 	host_tlb_params[1].sets = 1;
-
-	vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
-					   host_tlb_params[1].entries,
+	vcpu_e500->h2g_tlb1_rmap = kcalloc(host_tlb_params[1].entries,
+					   sizeof(*vcpu_e500->h2g_tlb1_rmap),
 					   GFP_KERNEL);
 	if (!vcpu_e500->h2g_tlb1_rmap)
 		return -EINVAL;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index b379146..c873ffe 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -259,10 +259,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 		case OP_31_XOP_MFSPR:
 			emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
+			if (emulated == EMULATE_AGAIN) {
+				emulated = EMULATE_DONE;
+				advance = 0;
+			}
 			break;
 
 		case OP_31_XOP_MTSPR:
 			emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
+			if (emulated == EMULATE_AGAIN) {
+				emulated = EMULATE_DONE;
+				advance = 0;
+			}
 			break;
 
 		case OP_31_XOP_TLBSYNC:
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
index 6d3c0ee..af83353 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -34,18 +34,38 @@
 #include "timing.h"
 #include "trace.h"
 
-/* XXX to do:
- * lhax
- * lhaux
- * lswx
- * lswi
- * stswx
- * stswi
- * lha
- * lhau
- * lmw
- * stmw
+#ifdef CONFIG_PPC_FPU
+static bool kvmppc_check_fp_disabled(struct kvm_vcpu *vcpu)
+{
+	if (!(kvmppc_get_msr(vcpu) & MSR_FP)) {
+		kvmppc_core_queue_fpunavail(vcpu);
+		return true;
+	}
+
+	return false;
+}
+#endif /* CONFIG_PPC_FPU */
+
+#ifdef CONFIG_VSX
+static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu)
+{
+	if (!(kvmppc_get_msr(vcpu) & MSR_VSX)) {
+		kvmppc_core_queue_vsx_unavail(vcpu);
+		return true;
+	}
+
+	return false;
+}
+#endif /* CONFIG_VSX */
+
+/*
+ * XXX to do:
+ * lfiwax, lfiwzx
+ * vector loads and stores
  *
+ * Instructions that trap when used on cache-inhibited mappings
+ * are not emulated here: multiple and string instructions,
+ * lq/stq, and the load-reserve/store-conditional instructions.
  */
 int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 {
@@ -66,6 +86,19 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 	rs = get_rs(inst);
 	rt = get_rt(inst);
 
+	/*
+	 * if mmio_vsx_tx_sx_enabled == 0, copy data between
+	 * VSR[0..31] and memory
+	 * if mmio_vsx_tx_sx_enabled == 1, copy data between
+	 * VSR[32..63] and memory
+	 */
+	vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
+	vcpu->arch.mmio_vsx_copy_nums = 0;
+	vcpu->arch.mmio_vsx_offset = 0;
+	vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE;
+	vcpu->arch.mmio_sp64_extend = 0;
+	vcpu->arch.mmio_sign_extend = 0;
+
 	switch (get_op(inst)) {
 	case 31:
 		switch (get_xop(inst)) {
@@ -73,6 +106,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 			break;
 
+		case OP_31_XOP_LWZUX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
 		case OP_31_XOP_LBZX:
 			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
 			break;
@@ -82,22 +120,36 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
+		case OP_31_XOP_STDX:
+			emulated = kvmppc_handle_store(run, vcpu,
+					kvmppc_get_gpr(vcpu, rs), 8, 1);
+			break;
+
+		case OP_31_XOP_STDUX:
+			emulated = kvmppc_handle_store(run, vcpu,
+					kvmppc_get_gpr(vcpu, rs), 8, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
 		case OP_31_XOP_STWX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               4, 1);
+					kvmppc_get_gpr(vcpu, rs), 4, 1);
+			break;
+
+		case OP_31_XOP_STWUX:
+			emulated = kvmppc_handle_store(run, vcpu,
+					kvmppc_get_gpr(vcpu, rs), 4, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
 		case OP_31_XOP_STBX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               1, 1);
+					kvmppc_get_gpr(vcpu, rs), 1, 1);
 			break;
 
 		case OP_31_XOP_STBUX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               1, 1);
+					kvmppc_get_gpr(vcpu, rs), 1, 1);
 			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
@@ -105,6 +157,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
 			break;
 
+		case OP_31_XOP_LHAUX:
+			emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
 		case OP_31_XOP_LHZX:
 			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
 			break;
@@ -116,14 +173,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
 		case OP_31_XOP_STHX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               2, 1);
+					kvmppc_get_gpr(vcpu, rs), 2, 1);
 			break;
 
 		case OP_31_XOP_STHUX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               2, 1);
+					kvmppc_get_gpr(vcpu, rs), 2, 1);
 			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 			break;
 
@@ -143,8 +198,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
 		case OP_31_XOP_STWBRX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               4, 0);
+					kvmppc_get_gpr(vcpu, rs), 4, 0);
 			break;
 
 		case OP_31_XOP_LHBRX:
@@ -153,10 +207,258 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
 		case OP_31_XOP_STHBRX:
 			emulated = kvmppc_handle_store(run, vcpu,
-						       kvmppc_get_gpr(vcpu, rs),
-			                               2, 0);
+					kvmppc_get_gpr(vcpu, rs), 2, 0);
+			break;
+
+		case OP_31_XOP_LDBRX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 0);
+			break;
+
+		case OP_31_XOP_STDBRX:
+			emulated = kvmppc_handle_store(run, vcpu,
+					kvmppc_get_gpr(vcpu, rs), 8, 0);
+			break;
+
+		case OP_31_XOP_LDX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+			break;
+
+		case OP_31_XOP_LDUX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_LWAX:
+			emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1);
+			break;
+
+		case OP_31_XOP_LWAUX:
+			emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+#ifdef CONFIG_PPC_FPU
+		case OP_31_XOP_LFSX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_sp64_extend = 1;
+			emulated = kvmppc_handle_load(run, vcpu,
+				KVM_MMIO_REG_FPR|rt, 4, 1);
+			break;
+
+		case OP_31_XOP_LFSUX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_sp64_extend = 1;
+			emulated = kvmppc_handle_load(run, vcpu,
+				KVM_MMIO_REG_FPR|rt, 4, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_LFDX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_load(run, vcpu,
+				KVM_MMIO_REG_FPR|rt, 8, 1);
+			break;
+
+		case OP_31_XOP_LFDUX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_load(run, vcpu,
+				KVM_MMIO_REG_FPR|rt, 8, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_LFIWAX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_loads(run, vcpu,
+				KVM_MMIO_REG_FPR|rt, 4, 1);
+			break;
+
+		case OP_31_XOP_LFIWZX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_load(run, vcpu,
+				KVM_MMIO_REG_FPR|rt, 4, 1);
+			break;
+
+		case OP_31_XOP_STFSX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_sp64_extend = 1;
+			emulated = kvmppc_handle_store(run, vcpu,
+				VCPU_FPR(vcpu, rs), 4, 1);
+			break;
+
+		case OP_31_XOP_STFSUX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_sp64_extend = 1;
+			emulated = kvmppc_handle_store(run, vcpu,
+				VCPU_FPR(vcpu, rs), 4, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_STFDX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_store(run, vcpu,
+				VCPU_FPR(vcpu, rs), 8, 1);
+			break;
+
+		case OP_31_XOP_STFDUX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_store(run, vcpu,
+				VCPU_FPR(vcpu, rs), 8, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_STFIWX:
+			if (kvmppc_check_fp_disabled(vcpu))
+				return EMULATE_DONE;
+			emulated = kvmppc_handle_store(run, vcpu,
+				VCPU_FPR(vcpu, rs), 4, 1);
+			break;
+#endif
+
+#ifdef CONFIG_VSX
+		case OP_31_XOP_LXSDX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 8, 1, 0);
+			break;
+
+		case OP_31_XOP_LXSSPX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			vcpu->arch.mmio_sp64_extend = 1;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 4, 1, 0);
+			break;
+
+		case OP_31_XOP_LXSIWAX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 4, 1, 1);
+			break;
+
+		case OP_31_XOP_LXSIWZX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 4, 1, 0);
+			break;
+
+		case OP_31_XOP_LXVD2X:
+		/*
+		 * In this case, the official load/store process is like this:
+		 * Step1, exit from vm by page fault isr, then kvm save vsr.
+		 * Please see guest_exit_cont->store_fp_state->SAVE_32VSRS
+		 * as reference.
+		 *
+		 * Step2, copy data between memory and VCPU
+		 * Notice: for LXVD2X/STXVD2X/LXVW4X/STXVW4X, we use
+		 * 2copies*8bytes or 4copies*4bytes
+		 * to simulate one copy of 16bytes.
+		 * Also there is an endian issue here, we should notice the
+		 * layout of memory.
+		 * Please see MARCO of LXVD2X_ROT/STXVD2X_ROT as more reference.
+		 * If host is little-endian, kvm will call XXSWAPD for
+		 * LXVD2X_ROT/STXVD2X_ROT.
+		 * So, if host is little-endian,
+		 * the postion of memeory should be swapped.
+		 *
+		 * Step3, return to guest, kvm reset register.
+		 * Please see kvmppc_hv_entry->load_fp_state->REST_32VSRS
+		 * as reference.
+		 */
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 2;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 8, 1, 0);
+			break;
+
+		case OP_31_XOP_LXVW4X:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 4;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 4, 1, 0);
+			break;
+
+		case OP_31_XOP_LXVDSX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type =
+				 KVMPPC_VSX_COPY_DWORD_LOAD_DUMP;
+			emulated = kvmppc_handle_vsx_load(run, vcpu,
+				KVM_MMIO_REG_VSX|rt, 8, 1, 0);
+			break;
+
+		case OP_31_XOP_STXSDX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			emulated = kvmppc_handle_vsx_store(run, vcpu,
+						 rs, 8, 1);
 			break;
 
+		case OP_31_XOP_STXSSPX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			vcpu->arch.mmio_sp64_extend = 1;
+			emulated = kvmppc_handle_vsx_store(run, vcpu,
+						 rs, 4, 1);
+			break;
+
+		case OP_31_XOP_STXSIWX:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_offset = 1;
+			vcpu->arch.mmio_vsx_copy_nums = 1;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD;
+			emulated = kvmppc_handle_vsx_store(run, vcpu,
+							 rs, 4, 1);
+			break;
+
+		case OP_31_XOP_STXVD2X:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 2;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD;
+			emulated = kvmppc_handle_vsx_store(run, vcpu,
+							 rs, 8, 1);
+			break;
+
+		case OP_31_XOP_STXVW4X:
+			if (kvmppc_check_vsx_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.mmio_vsx_copy_nums = 4;
+			vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD;
+			emulated = kvmppc_handle_vsx_store(run, vcpu,
+							 rs, 4, 1);
+			break;
+#endif /* CONFIG_VSX */
 		default:
 			emulated = EMULATE_FAIL;
 			break;
@@ -167,10 +469,60 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 		break;
 
-	/* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */
+#ifdef CONFIG_PPC_FPU
+	case OP_STFS:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		vcpu->arch.mmio_sp64_extend = 1;
+		emulated = kvmppc_handle_store(run, vcpu,
+			VCPU_FPR(vcpu, rs),
+			4, 1);
+		break;
+
+	case OP_STFSU:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		vcpu->arch.mmio_sp64_extend = 1;
+		emulated = kvmppc_handle_store(run, vcpu,
+			VCPU_FPR(vcpu, rs),
+			4, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_STFD:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		emulated = kvmppc_handle_store(run, vcpu,
+			VCPU_FPR(vcpu, rs),
+	                               8, 1);
+		break;
+
+	case OP_STFDU:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		emulated = kvmppc_handle_store(run, vcpu,
+			VCPU_FPR(vcpu, rs),
+	                               8, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+#endif
+
 	case OP_LD:
 		rt = get_rt(inst);
-		emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+		switch (inst & 3) {
+		case 0:	/* ld */
+			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+			break;
+		case 1: /* ldu */
+			emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+		case 2:	/* lwa */
+			emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1);
+			break;
+		default:
+			emulated = EMULATE_FAIL;
+		}
 		break;
 
 	case OP_LWZU:
@@ -193,31 +545,37 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 		                               4, 1);
 		break;
 
-	/* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */
 	case OP_STD:
 		rs = get_rs(inst);
-		emulated = kvmppc_handle_store(run, vcpu,
-					       kvmppc_get_gpr(vcpu, rs),
-		                               8, 1);
+		switch (inst & 3) {
+		case 0:	/* std */
+			emulated = kvmppc_handle_store(run, vcpu,
+				kvmppc_get_gpr(vcpu, rs), 8, 1);
+			break;
+		case 1: /* stdu */
+			emulated = kvmppc_handle_store(run, vcpu,
+				kvmppc_get_gpr(vcpu, rs), 8, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+		default:
+			emulated = EMULATE_FAIL;
+		}
 		break;
 
 	case OP_STWU:
 		emulated = kvmppc_handle_store(run, vcpu,
-					       kvmppc_get_gpr(vcpu, rs),
-		                               4, 1);
+				kvmppc_get_gpr(vcpu, rs), 4, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
 	case OP_STB:
 		emulated = kvmppc_handle_store(run, vcpu,
-					       kvmppc_get_gpr(vcpu, rs),
-		                               1, 1);
+				kvmppc_get_gpr(vcpu, rs), 1, 1);
 		break;
 
 	case OP_STBU:
 		emulated = kvmppc_handle_store(run, vcpu,
-					       kvmppc_get_gpr(vcpu, rs),
-		                               1, 1);
+				kvmppc_get_gpr(vcpu, rs), 1, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
 
@@ -241,16 +599,48 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
 	case OP_STH:
 		emulated = kvmppc_handle_store(run, vcpu,
-					       kvmppc_get_gpr(vcpu, rs),
-		                               2, 1);
+				kvmppc_get_gpr(vcpu, rs), 2, 1);
 		break;
 
 	case OP_STHU:
 		emulated = kvmppc_handle_store(run, vcpu,
-					       kvmppc_get_gpr(vcpu, rs),
-		                               2, 1);
+				kvmppc_get_gpr(vcpu, rs), 2, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+#ifdef CONFIG_PPC_FPU
+	case OP_LFS:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		vcpu->arch.mmio_sp64_extend = 1;
+		emulated = kvmppc_handle_load(run, vcpu,
+			KVM_MMIO_REG_FPR|rt, 4, 1);
+		break;
+
+	case OP_LFSU:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		vcpu->arch.mmio_sp64_extend = 1;
+		emulated = kvmppc_handle_load(run, vcpu,
+			KVM_MMIO_REG_FPR|rt, 4, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_LFD:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		emulated = kvmppc_handle_load(run, vcpu,
+			KVM_MMIO_REG_FPR|rt, 8, 1);
+		break;
+
+	case OP_LFDU:
+		if (kvmppc_check_fp_disabled(vcpu))
+			return EMULATE_DONE;
+		emulated = kvmppc_handle_load(run, vcpu,
+			KVM_MMIO_REG_FPR|rt, 8, 1);
 		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
 		break;
+#endif
 
 	default:
 		emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 95c91a9..1ee22a9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -37,6 +37,7 @@
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
 #include <asm/iommu.h>
+#include <asm/switch_to.h>
 #include "timing.h"
 #include "irq.h"
 #include "../mm/mmu_decl.h"
@@ -232,7 +233,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	case EV_HCALL_TOKEN(EV_IDLE):
 		r = EV_SUCCESS;
 		kvm_vcpu_block(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		break;
 	default:
 		r = EV_UNIMPLEMENTED;
@@ -524,11 +525,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		/* We support this only for PR */
 		r = !hv_enabled;
 		break;
-#ifdef CONFIG_KVM_MMIO
-	case KVM_CAP_COALESCED_MMIO:
-		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-		break;
-#endif
 #ifdef CONFIG_KVM_MPIC
 	case KVM_CAP_IRQ_MPIC:
 		r = 1;
@@ -538,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
 	case KVM_CAP_SPAPR_TCE_64:
+		/* fallthrough */
+	case KVM_CAP_SPAPR_TCE_VFIO:
 	case KVM_CAP_PPC_RTAS:
 	case KVM_CAP_PPC_FIXUP_HCALL:
 	case KVM_CAP_PPC_ENABLE_HCALL:
@@ -806,6 +804,129 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 		kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
 }
 
+#ifdef CONFIG_VSX
+static inline int kvmppc_get_vsr_dword_offset(int index)
+{
+	int offset;
+
+	if ((index != 0) && (index != 1))
+		return -1;
+
+#ifdef __BIG_ENDIAN
+	offset =  index;
+#else
+	offset = 1 - index;
+#endif
+
+	return offset;
+}
+
+static inline int kvmppc_get_vsr_word_offset(int index)
+{
+	int offset;
+
+	if ((index > 3) || (index < 0))
+		return -1;
+
+#ifdef __BIG_ENDIAN
+	offset = index;
+#else
+	offset = 3 - index;
+#endif
+	return offset;
+}
+
+static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu,
+	u64 gpr)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (offset == -1)
+		return;
+
+	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		val.vval = VCPU_VSX_VR(vcpu, index);
+		val.vsxval[offset] = gpr;
+		VCPU_VSX_VR(vcpu, index) = val.vval;
+	} else {
+		VCPU_VSX_FPR(vcpu, index, offset) = gpr;
+	}
+}
+
+static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu,
+	u64 gpr)
+{
+	union kvmppc_one_reg val;
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+
+	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		val.vval = VCPU_VSX_VR(vcpu, index);
+		val.vsxval[0] = gpr;
+		val.vsxval[1] = gpr;
+		VCPU_VSX_VR(vcpu, index) = val.vval;
+	} else {
+		VCPU_VSX_FPR(vcpu, index, 0) = gpr;
+		VCPU_VSX_FPR(vcpu, index, 1) = gpr;
+	}
+}
+
+static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
+	u32 gpr32)
+{
+	union kvmppc_one_reg val;
+	int offset = kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset);
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+	int dword_offset, word_offset;
+
+	if (offset == -1)
+		return;
+
+	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		val.vval = VCPU_VSX_VR(vcpu, index);
+		val.vsx32val[offset] = gpr32;
+		VCPU_VSX_VR(vcpu, index) = val.vval;
+	} else {
+		dword_offset = offset / 2;
+		word_offset = offset % 2;
+		val.vsxval[0] = VCPU_VSX_FPR(vcpu, index, dword_offset);
+		val.vsx32val[word_offset] = gpr32;
+		VCPU_VSX_FPR(vcpu, index, dword_offset) = val.vsxval[0];
+	}
+}
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_PPC_FPU
+static inline u64 sp_to_dp(u32 fprs)
+{
+	u64 fprd;
+
+	preempt_disable();
+	enable_kernel_fp();
+	asm ("lfs%U1%X1 0,%1; stfd%U0%X0 0,%0" : "=m" (fprd) : "m" (fprs)
+	     : "fr0");
+	preempt_enable();
+	return fprd;
+}
+
+static inline u32 dp_to_sp(u64 fprd)
+{
+	u32 fprs;
+
+	preempt_disable();
+	enable_kernel_fp();
+	asm ("lfd%U1%X1 0,%1; stfs%U0%X0 0,%0" : "=m" (fprs) : "m" (fprd)
+	     : "fr0");
+	preempt_enable();
+	return fprs;
+}
+
+#else
+#define sp_to_dp(x)	(x)
+#define dp_to_sp(x)	(x)
+#endif /* CONFIG_PPC_FPU */
+
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
@@ -832,6 +953,10 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	/* conversion between single and double precision */
+	if ((vcpu->arch.mmio_sp64_extend) && (run->mmio.len == 4))
+		gpr = sp_to_dp(gpr);
+
 	if (vcpu->arch.mmio_sign_extend) {
 		switch (run->mmio.len) {
 #ifdef CONFIG_PPC64
@@ -848,8 +973,6 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 		}
 	}
 
-	kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
-
 	switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) {
 	case KVM_MMIO_REG_GPR:
 		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
@@ -866,6 +989,17 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
 		break;
 #endif
+#ifdef CONFIG_VSX
+	case KVM_MMIO_REG_VSX:
+		if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_DWORD)
+			kvmppc_set_vsr_dword(vcpu, gpr);
+		else if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_WORD)
+			kvmppc_set_vsr_word(vcpu, gpr);
+		else if (vcpu->arch.mmio_vsx_copy_type ==
+				KVMPPC_VSX_COPY_DWORD_LOAD_DUMP)
+			kvmppc_set_vsr_dword_dump(vcpu, gpr);
+		break;
+#endif
 	default:
 		BUG();
 	}
@@ -932,6 +1066,35 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return __kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian, 1);
 }
 
+#ifdef CONFIG_VSX
+int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			unsigned int rt, unsigned int bytes,
+			int is_default_endian, int mmio_sign_extend)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+
+	/* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */
+	if ( (vcpu->arch.mmio_vsx_copy_nums > 4) ||
+		(vcpu->arch.mmio_vsx_copy_nums < 0) ) {
+		return EMULATE_FAIL;
+	}
+
+	while (vcpu->arch.mmio_vsx_copy_nums) {
+		emulated = __kvmppc_handle_load(run, vcpu, rt, bytes,
+			is_default_endian, mmio_sign_extend);
+
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += run->mmio.len;
+
+		vcpu->arch.mmio_vsx_copy_nums--;
+		vcpu->arch.mmio_vsx_offset++;
+	}
+	return emulated;
+}
+#endif /* CONFIG_VSX */
+
 int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			u64 val, unsigned int bytes, int is_default_endian)
 {
@@ -957,6 +1120,9 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_is_write = 1;
 
+	if ((vcpu->arch.mmio_sp64_extend) && (bytes == 4))
+		val = dp_to_sp(val);
+
 	/* Store the value at the lowest bytes in 'data'. */
 	if (!host_swabbed) {
 		switch (bytes) {
@@ -990,6 +1156,129 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(kvmppc_handle_store);
 
+#ifdef CONFIG_VSX
+static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
+{
+	u32 dword_offset, word_offset;
+	union kvmppc_one_reg reg;
+	int vsx_offset = 0;
+	int copy_type = vcpu->arch.mmio_vsx_copy_type;
+	int result = 0;
+
+	switch (copy_type) {
+	case KVMPPC_VSX_COPY_DWORD:
+		vsx_offset =
+			kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset);
+
+		if (vsx_offset == -1) {
+			result = -1;
+			break;
+		}
+
+		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+			*val = VCPU_VSX_FPR(vcpu, rs, vsx_offset);
+		} else {
+			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			*val = reg.vsxval[vsx_offset];
+		}
+		break;
+
+	case KVMPPC_VSX_COPY_WORD:
+		vsx_offset =
+			kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset);
+
+		if (vsx_offset == -1) {
+			result = -1;
+			break;
+		}
+
+		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+			dword_offset = vsx_offset / 2;
+			word_offset = vsx_offset % 2;
+			reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset);
+			*val = reg.vsx32val[word_offset];
+		} else {
+			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			*val = reg.vsx32val[vsx_offset];
+		}
+		break;
+
+	default:
+		result = -1;
+		break;
+	}
+
+	return result;
+}
+
+int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			int rs, unsigned int bytes, int is_default_endian)
+{
+	u64 val;
+	enum emulation_result emulated = EMULATE_DONE;
+
+	vcpu->arch.io_gpr = rs;
+
+	/* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */
+	if ( (vcpu->arch.mmio_vsx_copy_nums > 4) ||
+		(vcpu->arch.mmio_vsx_copy_nums < 0) ) {
+		return EMULATE_FAIL;
+	}
+
+	while (vcpu->arch.mmio_vsx_copy_nums) {
+		if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1)
+			return EMULATE_FAIL;
+
+		emulated = kvmppc_handle_store(run, vcpu,
+			 val, bytes, is_default_endian);
+
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += run->mmio.len;
+
+		vcpu->arch.mmio_vsx_copy_nums--;
+		vcpu->arch.mmio_vsx_offset++;
+	}
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu,
+			struct kvm_run *run)
+{
+	enum emulation_result emulated = EMULATE_FAIL;
+	int r;
+
+	vcpu->arch.paddr_accessed += run->mmio.len;
+
+	if (!vcpu->mmio_is_write) {
+		emulated = kvmppc_handle_vsx_load(run, vcpu, vcpu->arch.io_gpr,
+			 run->mmio.len, 1, vcpu->arch.mmio_sign_extend);
+	} else {
+		emulated = kvmppc_handle_vsx_store(run, vcpu,
+			 vcpu->arch.io_gpr, run->mmio.len, 1);
+	}
+
+	switch (emulated) {
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		r = RESUME_HOST;
+		break;
+	case EMULATE_FAIL:
+		pr_info("KVM: MMIO emulation failed (VSX repeat)\n");
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		r = RESUME_HOST;
+		break;
+	default:
+		r = RESUME_GUEST;
+		break;
+	}
+	return r;
+}
+#endif /* CONFIG_VSX */
+
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
 	int r = 0;
@@ -1092,13 +1381,24 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	int r;
 	sigset_t sigsaved;
 
-	if (vcpu->sigset_active)
-		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
-
 	if (vcpu->mmio_needed) {
+		vcpu->mmio_needed = 0;
 		if (!vcpu->mmio_is_write)
 			kvmppc_complete_mmio_load(vcpu, run);
-		vcpu->mmio_needed = 0;
+#ifdef CONFIG_VSX
+		if (vcpu->arch.mmio_vsx_copy_nums > 0) {
+			vcpu->arch.mmio_vsx_copy_nums--;
+			vcpu->arch.mmio_vsx_offset++;
+		}
+
+		if (vcpu->arch.mmio_vsx_copy_nums > 0) {
+			r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run);
+			if (r == RESUME_HOST) {
+				vcpu->mmio_needed = 1;
+				return r;
+			}
+		}
+#endif
 	} else if (vcpu->arch.osi_needed) {
 		u64 *gprs = run->osi.gprs;
 		int i;
@@ -1120,6 +1420,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 #endif
 	}
 
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
 	if (run->immediate_exit)
 		r = -EINTR;
 	else
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c..fc67bd7 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -314,6 +314,25 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_lookup);
 
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+		unsigned long ua, unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua <= ua) &&
+				(ua + size <= mem->ua +
+				 (mem->entries << PAGE_SHIFT))) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
+
 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries)
 {
@@ -345,6 +364,26 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
 
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+	void *va = &mem->hpas[entry];
+	unsigned long *pa;
+
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	pa = (void *) vmalloc_to_phys(va);
+	if (!pa)
+		return -EFAULT;
+
+	*hpa = *pa | (ua & ~PAGE_MASK);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
+
 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
 {
 	if (atomic64_inc_not_zero(&mem->mapped))
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e367382..ee4cdb5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1424,8 +1424,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
 	}
-	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
@@ -1860,6 +1859,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
 
 	return ret;
 }
+
+static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret)
+		pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
+
+	return ret;
+}
 #endif
 
 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
@@ -1874,6 +1884,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda1_tce_xchg,
+	.exchange_rm = pnv_ioda1_tce_xchg_rm,
 #endif
 	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
@@ -1948,7 +1959,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 {
 	struct iommu_table_group_link *tgl;
 
-	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+	list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
 		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 				struct pnv_ioda_pe, table_group);
 		struct pnv_phb *phb = pe->phb;
@@ -2004,6 +2015,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
 
 	return ret;
 }
+
+static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
+
+	return ret;
+}
 #endif
 
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
@@ -2017,13 +2039,13 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 static void pnv_ioda2_table_free(struct iommu_table *tbl)
 {
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, "pnv");
 }
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
 	.exchange = pnv_ioda2_tce_xchg,
+	.exchange_rm = pnv_ioda2_tce_xchg_rm,
 #endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
@@ -2203,7 +2225,7 @@ found:
 		__free_pages(tce_mem, get_order(tce32_segsz * segs));
 	if (tbl) {
 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
-		iommu_free_table(tbl, "pnv");
+		iommu_tce_table_put(tbl);
 	}
 }
 
@@ -2293,16 +2315,16 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 	if (!tbl)
 		return -ENOMEM;
 
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+
 	ret = pnv_pci_ioda2_table_alloc_pages(nid,
 			bus_offset, page_shift, window_size,
 			levels, tbl);
 	if (ret) {
-		iommu_free_table(tbl, "pnv");
+		iommu_tce_table_put(tbl);
 		return ret;
 	}
 
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-
 	*ptbl = tbl;
 
 	return 0;
@@ -2343,7 +2365,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
 				rc);
-		pnv_ioda2_table_free(tbl);
+		iommu_tce_table_put(tbl);
 		return rc;
 	}
 
@@ -2431,7 +2453,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
 		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
-	pnv_ioda2_table_free(tbl);
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -3406,7 +3428,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
 	}
 
 	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
@@ -3433,7 +3455,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 	}
 
 	pnv_pci_ioda2_table_free_pages(tbl);
-	iommu_free_table(tbl, "pnv");
+	iommu_tce_table_put(tbl);
 }
 
 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index eb835e9..204a829 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -767,6 +767,7 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
 
 	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
 
 	return tbl;
 }
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4d757ea..7ce5db2 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 		goto fail_exit;
 
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	kref_init(&tbl->it_kref);
 	tgl->table_group = table_group;
 	list_add_rcu(&tgl->next, &tbl->it_group_list);
 
@@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		BUG_ON(table_group->group);
 	}
 #endif
-	iommu_free_table(tbl, node_name);
+	iommu_tce_table_put(tbl);
 
 	kfree(table_group);
 }
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 7204939..28b09fd 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
 	if (tbl)
-		iommu_free_table(tbl, of_node_full_name(dev->of_node));
+		iommu_tce_table_put(tbl);
 	of_node_put(dev->of_node);
 	kfree(to_vio_dev(dev));
 }
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index e2dfbf2..31cac7d 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -26,6 +26,7 @@
 #define CPACF_PCC		0xb92c		/* MSA4 */
 #define CPACF_KMCTR		0xb92d		/* MSA4 */
 #define CPACF_PPNO		0xb93c		/* MSA5 */
+#define CPACF_KMA		0xb929		/* MSA8 */
 
 /*
  * En/decryption modifier bits
@@ -149,8 +150,8 @@ static inline void __cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
 
 	asm volatile(
 		"	spm 0\n" /* pckmo doesn't change the cc */
-		/* Parameter registers are ignored, but may not be 0 */
-		"0:	.insn	rrf,%[opc] << 16,2,2,2,0\n"
+		/* Parameter regs are ignored, but must be nonzero and unique */
+		"0:	.insn	rrf,%[opc] << 16,2,4,6,0\n"
 		"	brc	1,0b\n"	/* handle partial completion */
 		: "=m" (*mask)
 		: [fc] "d" (r0), [pba] "a" (r1), [opc] "i" (opcode)
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 1d48880..e8f6230 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -105,6 +105,7 @@
 #define HWCAP_S390_VXRS		2048
 #define HWCAP_S390_VXRS_BCD	4096
 #define HWCAP_S390_VXRS_EXT	8192
+#define HWCAP_S390_GS		16384
 
 /* Internal bits, not exposed via elf */
 #define HWCAP_INT_SIE		1UL
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index a41faf3..426614a 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include <asm/cpu.h>
 #include <asm/fpu/api.h>
 #include <asm/isc.h>
+#include <asm/guarded_storage.h>
 
 #define KVM_S390_BSCA_CPU_SLOTS 64
 #define KVM_S390_ESCA_CPU_SLOTS 248
@@ -121,6 +122,7 @@ struct esca_block {
 #define CPUSTAT_SLSR       0x00002000
 #define CPUSTAT_ZARCH      0x00000800
 #define CPUSTAT_MCDS       0x00000100
+#define CPUSTAT_KSS        0x00000200
 #define CPUSTAT_SM         0x00000080
 #define CPUSTAT_IBS        0x00000040
 #define CPUSTAT_GED2       0x00000010
@@ -164,16 +166,27 @@ struct kvm_s390_sie_block {
 #define ICTL_RRBE	0x00001000
 #define ICTL_TPROT	0x00000200
 	__u32	ictl;			/* 0x0048 */
+#define ECA_CEI		0x80000000
+#define ECA_IB		0x40000000
+#define ECA_SIGPI	0x10000000
+#define ECA_MVPGI	0x01000000
+#define ECA_VX		0x00020000
+#define ECA_PROTEXCI	0x00002000
+#define ECA_SII		0x00000001
 	__u32	eca;			/* 0x004c */
 #define ICPT_INST	0x04
 #define ICPT_PROGI	0x08
 #define ICPT_INSTPROGI	0x0C
+#define ICPT_EXTREQ	0x10
 #define ICPT_EXTINT	0x14
+#define ICPT_IOREQ	0x18
+#define ICPT_WAIT	0x1c
 #define ICPT_VALIDITY	0x20
 #define ICPT_STOP	0x28
 #define ICPT_OPEREXC	0x2C
 #define ICPT_PARTEXEC	0x38
 #define ICPT_IOINST	0x40
+#define ICPT_KSS	0x5c
 	__u8	icptcode;		/* 0x0050 */
 	__u8	icptstatus;		/* 0x0051 */
 	__u16	ihcpu;			/* 0x0052 */
@@ -182,10 +195,19 @@ struct kvm_s390_sie_block {
 	__u32	ipb;			/* 0x0058 */
 	__u32	scaoh;			/* 0x005c */
 	__u8	reserved60;		/* 0x0060 */
+#define ECB_GS		0x40
+#define ECB_TE		0x10
+#define ECB_SRSI	0x04
+#define ECB_HOSTPROTINT	0x02
 	__u8	ecb;			/* 0x0061 */
+#define ECB2_CMMA	0x80
+#define ECB2_IEP	0x20
+#define ECB2_PFMFI	0x08
+#define ECB2_ESCA	0x04
 	__u8    ecb2;                   /* 0x0062 */
-#define ECB3_AES 0x04
 #define ECB3_DEA 0x08
+#define ECB3_AES 0x04
+#define ECB3_RI  0x01
 	__u8    ecb3;			/* 0x0063 */
 	__u32	scaol;			/* 0x0064 */
 	__u8	reserved68[4];		/* 0x0068 */
@@ -219,11 +241,14 @@ struct kvm_s390_sie_block {
 	__u32	crycbd;			/* 0x00fc */
 	__u64	gcr[16];		/* 0x0100 */
 	__u64	gbea;			/* 0x0180 */
-	__u8	reserved188[24];	/* 0x0188 */
+	__u8    reserved188[8];		/* 0x0188 */
+	__u64   sdnxo;			/* 0x0190 */
+	__u8    reserved198[8];		/* 0x0198 */
 	__u32	fac;			/* 0x01a0 */
 	__u8	reserved1a4[20];	/* 0x01a4 */
 	__u64	cbrlo;			/* 0x01b8 */
 	__u8	reserved1c0[8];		/* 0x01c0 */
+#define ECD_HOSTREGMGMT	0x20000000
 	__u32	ecd;			/* 0x01c8 */
 	__u8	reserved1cc[18];	/* 0x01cc */
 	__u64	pp;			/* 0x01de */
@@ -498,6 +523,12 @@ struct kvm_s390_local_interrupt {
 #define FIRQ_CNTR_PFAULT   3
 #define FIRQ_MAX_COUNT     4
 
+/* mask the AIS mode for a given ISC */
+#define AIS_MODE_MASK(isc) (0x80 >> isc)
+
+#define KVM_S390_AIS_MODE_ALL    0
+#define KVM_S390_AIS_MODE_SINGLE 1
+
 struct kvm_s390_float_interrupt {
 	unsigned long pending_irqs;
 	spinlock_t lock;
@@ -507,6 +538,10 @@ struct kvm_s390_float_interrupt {
 	struct kvm_s390_ext_info srv_signal;
 	int next_rr_cpu;
 	unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+	struct mutex ais_lock;
+	u8 simm;
+	u8 nimm;
+	int ais_enabled;
 };
 
 struct kvm_hw_wp_info_arch {
@@ -554,6 +589,7 @@ struct kvm_vcpu_arch {
 	/* if vsie is active, currently executed shadow sie control block */
 	struct kvm_s390_sie_block *vsie_block;
 	unsigned int      host_acrs[NUM_ACRS];
+	struct gs_cb      *host_gscb;
 	struct fpu	  host_fpregs;
 	struct kvm_s390_local_interrupt local_int;
 	struct hrtimer    ckc_timer;
@@ -574,6 +610,7 @@ struct kvm_vcpu_arch {
 	 */
 	seqcount_t cputm_seqcount;
 	__u64 cputm_start;
+	bool gs_enabled;
 };
 
 struct kvm_vm_stat {
@@ -596,6 +633,7 @@ struct s390_io_adapter {
 	bool maskable;
 	bool masked;
 	bool swap;
+	bool suppressible;
 	struct rw_semaphore maps_lock;
 	struct list_head maps;
 	atomic_t nr_maps;
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 61261e0..8a5b082 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -157,8 +157,8 @@ struct lowcore {
 	__u64	stfle_fac_list[32];		/* 0x0f00 */
 	__u8	pad_0x1000[0x11b0-0x1000];	/* 0x1000 */
 
-	/* Pointer to vector register save area */
-	__u64	vector_save_area_addr;		/* 0x11b0 */
+	/* Pointer to the machine check extended save area */
+	__u64	mcesad;				/* 0x11b0 */
 
 	/* 64 bit extparam used for pfault/diag 250: defined by architecture */
 	__u64	ext_params2;			/* 0x11B8 */
@@ -182,10 +182,7 @@ struct lowcore {
 
 	/* Transaction abort diagnostic block */
 	__u8	pgm_tdb[256];			/* 0x1800 */
-	__u8	pad_0x1900[0x1c00-0x1900];	/* 0x1900 */
-
-	/* Software defined save area for vector registers */
-	__u8	vector_save_area[1024];		/* 0x1c00 */
+	__u8	pad_0x1900[0x2000-0x1900];	/* 0x1900 */
 } __packed;
 
 #define S390_lowcore (*((struct lowcore *) 0))
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index b75fd91..e3e8895 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -58,7 +58,9 @@ union mci {
 		u64 ie :  1; /* 32 indirect storage error */
 		u64 ar :  1; /* 33 access register validity */
 		u64 da :  1; /* 34 delayed access exception */
-		u64    :  7; /* 35-41 */
+		u64    :  1; /* 35 */
+		u64 gs :  1; /* 36 guarded storage registers */
+		u64    :  5; /* 37-41 */
 		u64 pr :  1; /* 42 tod programmable register validity */
 		u64 fc :  1; /* 43 fp control register validity */
 		u64 ap :  1; /* 44 ancillary report */
@@ -69,6 +71,14 @@ union mci {
 	};
 };
 
+#define MCESA_ORIGIN_MASK	(~0x3ffUL)
+#define MCESA_LC_MASK		(0xfUL)
+
+struct mcesa {
+	u8 vector_save_area[1024];
+	u8 guarded_storage_save_area[32];
+};
+
 struct pt_regs;
 
 extern void s390_handle_mcck(void);
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index e498871..cc101f9 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -135,6 +135,8 @@ struct thread_struct {
 	struct list_head list;
 	/* cpu runtime instrumentation */
 	struct runtime_instr_cb *ri_cb;
+	struct gs_cb *gs_cb;		/* Current guarded storage cb */
+	struct gs_cb *gs_bc_cb;		/* Broadcast guarded storage cb */
 	unsigned char trap_tdb[256];	/* Transaction abort diagnose block */
 	/*
 	 * Warning: 'fpu' is dynamically-sized. It *MUST* be at
@@ -215,6 +217,9 @@ void show_cacheinfo(struct seq_file *m);
 /* Free all resources held by a thread. */
 extern void release_thread(struct task_struct *);
 
+/* Free guarded storage control block for current */
+void exit_thread_gs(void);
+
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index ace3bd3..6f5167b 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -75,6 +75,7 @@ struct sclp_info {
 	unsigned char has_pfmfi : 1;
 	unsigned char has_ibs : 1;
 	unsigned char has_skey : 1;
+	unsigned char has_kss : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 30bdb5a..383bd83 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -31,6 +31,7 @@
 #define MACHINE_FLAG_VX		_BITUL(13)
 #define MACHINE_FLAG_CAD	_BITUL(14)
 #define MACHINE_FLAG_NX		_BITUL(15)
+#define MACHINE_FLAG_GS		_BITUL(16)
 
 #define LPP_MAGIC		_BITUL(31)
 #define LPP_PFAULT_PID_MASK	_AC(0xffffffff, UL)
@@ -70,6 +71,7 @@ extern void detect_memory_memblock(void);
 #define MACHINE_HAS_VX		(S390_lowcore.machine_flags & MACHINE_FLAG_VX)
 #define MACHINE_HAS_CAD		(S390_lowcore.machine_flags & MACHINE_FLAG_CAD)
 #define MACHINE_HAS_NX		(S390_lowcore.machine_flags & MACHINE_FLAG_NX)
+#define MACHINE_HAS_GS		(S390_lowcore.machine_flags & MACHINE_FLAG_GS)
 
 /*
  * Console mode. Override with conmode=
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index 12d45f0..f6c2b58 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
@@ -10,6 +10,7 @@
 #include <linux/thread_info.h>
 #include <asm/fpu/api.h>
 #include <asm/ptrace.h>
+#include <asm/guarded_storage.h>
 
 extern struct task_struct *__switch_to(void *, void *);
 extern void update_cr_regs(struct task_struct *task);
@@ -33,12 +34,14 @@ static inline void restore_access_regs(unsigned int *acrs)
 		save_fpu_regs();					\
 		save_access_regs(&prev->thread.acrs[0]);		\
 		save_ri_cb(prev->thread.ri_cb);				\
+		save_gs_cb(prev->thread.gs_cb);				\
 	}								\
 	if (next->mm) {							\
 		update_cr_regs(next);					\
 		set_cpu_flag(CIF_FPU);					\
 		restore_access_regs(&next->thread.acrs[0]);		\
 		restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);	\
+		restore_gs_cb(next->thread.gs_cb);			\
 	}								\
 	prev = __switch_to(prev,next);					\
 } while (0)
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index a5b54a4..f36e6e2 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -54,11 +54,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define TIF_NOTIFY_RESUME	0	/* callback before returning to user */
 #define TIF_SIGPENDING		1	/* signal pending */
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
-#define TIF_SYSCALL_TRACE	3	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
-#define TIF_SECCOMP		5	/* secure computing */
-#define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
-#define TIF_UPROBE		7	/* breakpointed or single-stepping */
+#define TIF_UPROBE		3	/* breakpointed or single-stepping */
+#define TIF_GUARDED_STORAGE	4	/* load guarded storage control block */
+#define TIF_SYSCALL_TRACE	8	/* syscall trace active */
+#define TIF_SYSCALL_AUDIT	9	/* syscall auditing active */
+#define TIF_SECCOMP		10	/* secure computing */
+#define TIF_SYSCALL_TRACEPOINT	11	/* syscall tracepoint instrumentation */
 #define TIF_31BIT		16	/* 32bit process */
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	18	/* restore signal mask in do_signal() */
@@ -76,5 +77,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define _TIF_UPROBE		_BITUL(TIF_UPROBE)
 #define _TIF_31BIT		_BITUL(TIF_31BIT)
 #define _TIF_SINGLE_STEP	_BITUL(TIF_SINGLE_STEP)
+#define _TIF_GUARDED_STORAGE	_BITUL(TIF_GUARDED_STORAGE)
 
 #endif /* _ASM_THREAD_INFO_H */
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index 6848ba5..86b761e 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -12,6 +12,7 @@ header-y += dasd.h
 header-y += debug.h
 header-y += errno.h
 header-y += fcntl.h
+header-y += guarded_storage.h
 header-y += hypfs.h
 header-y += ioctl.h
 header-y += ioctls.h
diff --git a/arch/s390/include/uapi/asm/guarded_storage.h b/arch/s390/include/uapi/asm/guarded_storage.h
new file mode 100644
index 0000000..852850e
--- /dev/null
+++ b/arch/s390/include/uapi/asm/guarded_storage.h
@@ -0,0 +1,77 @@
+#ifndef _GUARDED_STORAGE_H
+#define _GUARDED_STORAGE_H
+
+#include <linux/types.h>
+
+struct gs_cb {
+	__u64 reserved;
+	__u64 gsd;
+	__u64 gssm;
+	__u64 gs_epl_a;
+};
+
+struct gs_epl {
+	__u8 pad1;
+	union {
+		__u8 gs_eam;
+		struct {
+			__u8	: 6;
+			__u8 e	: 1;
+			__u8 b	: 1;
+		};
+	};
+	union {
+		__u8 gs_eci;
+		struct {
+			__u8 tx	: 1;
+			__u8 cx	: 1;
+			__u8	: 5;
+			__u8 in	: 1;
+		};
+	};
+	union {
+		__u8 gs_eai;
+		struct {
+			__u8	: 1;
+			__u8 t	: 1;
+			__u8 as	: 2;
+			__u8 ar	: 4;
+		};
+	};
+	__u32 pad2;
+	__u64 gs_eha;
+	__u64 gs_eia;
+	__u64 gs_eoa;
+	__u64 gs_eir;
+	__u64 gs_era;
+};
+
+#define GS_ENABLE	0
+#define	GS_DISABLE	1
+#define GS_SET_BC_CB	2
+#define GS_CLEAR_BC_CB	3
+#define GS_BROADCAST	4
+
+static inline void load_gs_cb(struct gs_cb *gs_cb)
+{
+	asm volatile(".insn rxy,0xe3000000004d,0,%0" : : "Q" (*gs_cb));
+}
+
+static inline void store_gs_cb(struct gs_cb *gs_cb)
+{
+	asm volatile(".insn rxy,0xe30000000049,0,%0" : : "Q" (*gs_cb));
+}
+
+static inline void save_gs_cb(struct gs_cb *gs_cb)
+{
+	if (gs_cb)
+		store_gs_cb(gs_cb);
+}
+
+static inline void restore_gs_cb(struct gs_cb *gs_cb)
+{
+	if (gs_cb)
+		load_gs_cb(gs_cb);
+}
+
+#endif /* _GUARDED_STORAGE_H */
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index a2ffec4..3dd2a1d 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -26,6 +26,8 @@
 #define KVM_DEV_FLIC_ADAPTER_REGISTER	6
 #define KVM_DEV_FLIC_ADAPTER_MODIFY	7
 #define KVM_DEV_FLIC_CLEAR_IO_IRQ	8
+#define KVM_DEV_FLIC_AISM		9
+#define KVM_DEV_FLIC_AIRQ_INJECT	10
 /*
  * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
  * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
@@ -41,7 +43,14 @@ struct kvm_s390_io_adapter {
 	__u8 isc;
 	__u8 maskable;
 	__u8 swap;
-	__u8 pad;
+	__u8 flags;
+};
+
+#define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01
+
+struct kvm_s390_ais_req {
+	__u8 isc;
+	__u16 mode;
 };
 
 #define KVM_S390_IO_ADAPTER_MASK 1
@@ -110,6 +119,7 @@ struct kvm_s390_vm_cpu_machine {
 #define KVM_S390_VM_CPU_FEAT_CMMA	10
 #define KVM_S390_VM_CPU_FEAT_PFMFI	11
 #define KVM_S390_VM_CPU_FEAT_SIGPIF	12
+#define KVM_S390_VM_CPU_FEAT_KSS	13
 struct kvm_s390_vm_cpu_feat {
 	__u64 feat[16];
 };
@@ -131,7 +141,8 @@ struct kvm_s390_vm_cpu_subfunc {
 	__u8 kmo[16];		/* with MSA4 */
 	__u8 pcc[16];		/* with MSA4 */
 	__u8 ppno[16];		/* with MSA5 */
-	__u8 reserved[1824];
+	__u8 kma[16];		/* with MSA8 */
+	__u8 reserved[1808];
 };
 
 /* kvm attributes for crypto */
@@ -197,6 +208,10 @@ struct kvm_guest_debug_arch {
 #define KVM_SYNC_VRS    (1UL << 6)
 #define KVM_SYNC_RICCB  (1UL << 7)
 #define KVM_SYNC_FPRS   (1UL << 8)
+#define KVM_SYNC_GSCB   (1UL << 9)
+/* length and alignment of the sdnx as a power of two */
+#define SDNXC 8
+#define SDNXL (1UL << SDNXC)
 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
 	__u64 prefix;	/* prefix register */
@@ -217,8 +232,16 @@ struct kvm_sync_regs {
 	};
 	__u8  reserved[512];	/* for future vector expansion */
 	__u32 fpc;		/* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */
-	__u8 padding[52];	/* riccb needs to be 64byte aligned */
+	__u8 padding1[52];	/* riccb needs to be 64byte aligned */
 	__u8 riccb[64];		/* runtime instrumentation controls block */
+	__u8 padding2[192];	/* sdnx needs to be 256byte aligned */
+	union {
+		__u8 sdnx[SDNXL];  /* state description annex */
+		struct {
+			__u64 reserved1[2];
+			__u64 gscb[4];
+		};
+	};
 };
 
 #define KVM_REG_S390_TODPR	(KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1)
diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h
index 152de9b..ea42290 100644
--- a/arch/s390/include/uapi/asm/unistd.h
+++ b/arch/s390/include/uapi/asm/unistd.h
@@ -313,7 +313,7 @@
 #define __NR_copy_file_range	375
 #define __NR_preadv2		376
 #define __NR_pwritev2		377
-/* Number 378 is reserved for guarded storage */
+#define __NR_s390_guarded_storage	378
 #define __NR_statx		379
 #define NR_syscalls 380
 
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 060ce54..aa5adbd 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -57,7 +57,7 @@ obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o als.o
 obj-y	+= sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
-obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o
+obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o
 obj-y	+= entry.o reipl.o relocate_kernel.o
 
 extra-y				+= head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index c4b3570..6bb2963 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -175,7 +175,7 @@ int main(void)
 	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
 	OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
 	/* hardware defined lowcore locations 0x1000 - 0x18ff */
-	OFFSET(__LC_VX_SAVE_AREA_ADDR, lowcore, vector_save_area_addr);
+	OFFSET(__LC_MCESAD, lowcore, mcesad);
 	OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2);
 	OFFSET(__LC_FPREGS_SAVE_AREA, lowcore, floating_pt_save_area);
 	OFFSET(__LC_GPREGS_SAVE_AREA, lowcore, gpregs_save_area);
diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c
index e89cc2e..986642a 100644
--- a/arch/s390/kernel/compat_wrapper.c
+++ b/arch/s390/kernel/compat_wrapper.c
@@ -178,4 +178,5 @@ COMPAT_SYSCALL_WRAP3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
 COMPAT_SYSCALL_WRAP6(sendto, int, fd, void __user *, buff, size_t, len, unsigned int, flags, struct sockaddr __user *, addr, int, addr_len);
 COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags);
 COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags);
+COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb);
 COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer);
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 4e65c79..95298a4 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -358,6 +358,8 @@ static __init void detect_machine_facilities(void)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_NX;
 		__ctl_set_bit(0, 20);
 	}
+	if (test_facility(133))
+		S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
 }
 
 static inline void save_vector_registers(void)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 6a7d737..fa8b8f2 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -47,7 +47,7 @@ STACK_SIZE  = 1 << STACK_SHIFT
 STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
 
 _TIF_WORK	= (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
-		   _TIF_UPROBE)
+		   _TIF_UPROBE | _TIF_GUARDED_STORAGE)
 _TIF_TRACE	= (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
 		   _TIF_SYSCALL_TRACEPOINT)
 _CIF_WORK	= (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
@@ -332,6 +332,8 @@ ENTRY(system_call)
 	TSTMSK	__TI_flags(%r12),_TIF_UPROBE
 	jo	.Lsysc_uprobe_notify
 #endif
+	TSTMSK	__TI_flags(%r12),_TIF_GUARDED_STORAGE
+	jo	.Lsysc_guarded_storage
 	TSTMSK	__PT_FLAGS(%r11),_PIF_PER_TRAP
 	jo	.Lsysc_singlestep
 	TSTMSK	__TI_flags(%r12),_TIF_SIGPENDING
@@ -409,6 +411,14 @@ ENTRY(system_call)
 #endif
 
 #
+# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
+#
+.Lsysc_guarded_storage:
+	lgr	%r2,%r11		# pass pointer to pt_regs
+	larl	%r14,.Lsysc_return
+	jg	gs_load_bc_cb
+
+#
 # _PIF_PER_TRAP is set, call do_per_trap
 #
 .Lsysc_singlestep:
@@ -663,6 +673,8 @@ ENTRY(io_int_handler)
 	jo	.Lio_sigpending
 	TSTMSK	__TI_flags(%r12),_TIF_NOTIFY_RESUME
 	jo	.Lio_notify_resume
+	TSTMSK	__TI_flags(%r12),_TIF_GUARDED_STORAGE
+	jo	.Lio_guarded_storage
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lio_vxrs
 	TSTMSK	__LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
@@ -697,6 +709,18 @@ ENTRY(io_int_handler)
 	jg	load_fpu_regs
 
 #
+# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
+#
+.Lio_guarded_storage:
+	# TRACE_IRQS_ON already done at .Lio_return
+	ssm	__LC_SVC_NEW_PSW	# reenable interrupts
+	lgr	%r2,%r11		# pass pointer to pt_regs
+	brasl	%r14,gs_load_bc_cb
+	ssm	__LC_PGM_NEW_PSW	# disable I/O and ext. interrupts
+	TRACE_IRQS_OFF
+	j	.Lio_return
+
+#
 # _TIF_NEED_RESCHED is set, call schedule
 #
 .Lio_reschedule:
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 33f9018..dbf5f7e 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -74,12 +74,14 @@ long sys_sigreturn(void);
 
 long sys_s390_personality(unsigned int personality);
 long sys_s390_runtime_instr(int command, int signum);
+long sys_s390_guarded_storage(int command, struct gs_cb __user *);
 long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t);
 long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t);
 
 DECLARE_PER_CPU(u64, mt_cycles[8]);
 
 void verify_facilities(void);
+void gs_load_bc_cb(struct pt_regs *regs);
 void set_fs_fixup(void);
 
 #endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c
new file mode 100644
index 0000000..6f06474
--- /dev/null
+++ b/arch/s390/kernel/guarded_storage.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/signal.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/guarded_storage.h>
+#include "entry.h"
+
+void exit_thread_gs(void)
+{
+	kfree(current->thread.gs_cb);
+	kfree(current->thread.gs_bc_cb);
+	current->thread.gs_cb = current->thread.gs_bc_cb = NULL;
+}
+
+static int gs_enable(void)
+{
+	struct gs_cb *gs_cb;
+
+	if (!current->thread.gs_cb) {
+		gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL);
+		if (!gs_cb)
+			return -ENOMEM;
+		gs_cb->gsd = 25;
+		preempt_disable();
+		__ctl_set_bit(2, 4);
+		load_gs_cb(gs_cb);
+		current->thread.gs_cb = gs_cb;
+		preempt_enable();
+	}
+	return 0;
+}
+
+static int gs_disable(void)
+{
+	if (current->thread.gs_cb) {
+		preempt_disable();
+		kfree(current->thread.gs_cb);
+		current->thread.gs_cb = NULL;
+		__ctl_clear_bit(2, 4);
+		preempt_enable();
+	}
+	return 0;
+}
+
+static int gs_set_bc_cb(struct gs_cb __user *u_gs_cb)
+{
+	struct gs_cb *gs_cb;
+
+	gs_cb = current->thread.gs_bc_cb;
+	if (!gs_cb) {
+		gs_cb = kzalloc(sizeof(*gs_cb), GFP_KERNEL);
+		if (!gs_cb)
+			return -ENOMEM;
+		current->thread.gs_bc_cb = gs_cb;
+	}
+	if (copy_from_user(gs_cb, u_gs_cb, sizeof(*gs_cb)))
+		return -EFAULT;
+	return 0;
+}
+
+static int gs_clear_bc_cb(void)
+{
+	struct gs_cb *gs_cb;
+
+	gs_cb = current->thread.gs_bc_cb;
+	current->thread.gs_bc_cb = NULL;
+	kfree(gs_cb);
+	return 0;
+}
+
+void gs_load_bc_cb(struct pt_regs *regs)
+{
+	struct gs_cb *gs_cb;
+
+	preempt_disable();
+	clear_thread_flag(TIF_GUARDED_STORAGE);
+	gs_cb = current->thread.gs_bc_cb;
+	if (gs_cb) {
+		kfree(current->thread.gs_cb);
+		current->thread.gs_bc_cb = NULL;
+		__ctl_set_bit(2, 4);
+		load_gs_cb(gs_cb);
+		current->thread.gs_cb = gs_cb;
+	}
+	preempt_enable();
+}
+
+static int gs_broadcast(void)
+{
+	struct task_struct *sibling;
+
+	read_lock(&tasklist_lock);
+	for_each_thread(current, sibling) {
+		if (!sibling->thread.gs_bc_cb)
+			continue;
+		if (test_and_set_tsk_thread_flag(sibling, TIF_GUARDED_STORAGE))
+			kick_process(sibling);
+	}
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+SYSCALL_DEFINE2(s390_guarded_storage, int, command,
+		struct gs_cb __user *, gs_cb)
+{
+	if (!MACHINE_HAS_GS)
+		return -EOPNOTSUPP;
+	switch (command) {
+	case GS_ENABLE:
+		return gs_enable();
+	case GS_DISABLE:
+		return gs_disable();
+	case GS_SET_BC_CB:
+		return gs_set_bc_cb(gs_cb);
+	case GS_CLEAR_BC_CB:
+		return gs_clear_bc_cb();
+	case GS_BROADCAST:
+		return gs_broadcast();
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 3074c1d..db5658d 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -27,6 +27,7 @@
 #include <asm/cacheflush.h>
 #include <asm/os_info.h>
 #include <asm/switch_to.h>
+#include <asm/nmi.h>
 
 typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long);
 
@@ -102,6 +103,8 @@ static void __do_machine_kdump(void *image)
  */
 static noinline void __machine_kdump(void *image)
 {
+	struct mcesa *mcesa;
+	unsigned long cr2_old, cr2_new;
 	int this_cpu, cpu;
 
 	lgr_info_log();
@@ -114,8 +117,16 @@ static noinline void __machine_kdump(void *image)
 			continue;
 	}
 	/* Store status of the boot CPU */
+	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
 	if (MACHINE_HAS_VX)
-		save_vx_regs((void *) &S390_lowcore.vector_save_area);
+		save_vx_regs((__vector128 *) mcesa->vector_save_area);
+	if (MACHINE_HAS_GS) {
+		__ctl_store(cr2_old, 2, 2);
+		cr2_new = cr2_old | (1UL << 4);
+		__ctl_load(cr2_new, 2, 2);
+		save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area);
+		__ctl_load(cr2_old, 2, 2);
+	}
 	/*
 	 * To create a good backchain for this CPU in the dump store_status
 	 * is passed the address of a function. The address is saved into
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 9bf8327..9855895 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -106,6 +106,7 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 	int kill_task;
 	u64 zero;
 	void *fpt_save_area;
+	struct mcesa *mcesa;
 
 	kill_task = 0;
 	zero = 0;
@@ -165,6 +166,7 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 			     : : "Q" (S390_lowcore.fpt_creg_save_area));
 	}
 
+	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
 	if (!MACHINE_HAS_VX) {
 		/* Validate floating point registers */
 		asm volatile(
@@ -209,8 +211,8 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 			"	la	1,%0\n"
 			"	.word	0xe70f,0x1000,0x0036\n"	/* vlm 0,15,0(1) */
 			"	.word	0xe70f,0x1100,0x0c36\n"	/* vlm 16,31,256(1) */
-			: : "Q" (*(struct vx_array *)
-				 &S390_lowcore.vector_save_area) : "1");
+			: : "Q" (*(struct vx_array *) mcesa->vector_save_area)
+			: "1");
 		__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
 	}
 	/* Validate access registers */
@@ -224,6 +226,19 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 		 */
 		kill_task = 1;
 	}
+	/* Validate guarded storage registers */
+	if (MACHINE_HAS_GS && (S390_lowcore.cregs_save_area[2] & (1UL << 4))) {
+		if (!mci.gs)
+			/*
+			 * Guarded storage register can't be restored and
+			 * the current processes uses guarded storage.
+			 * It has to be terminated.
+			 */
+			kill_task = 1;
+		else
+			load_gs_cb((struct gs_cb *)
+				   mcesa->guarded_storage_save_area);
+	}
 	/*
 	 * We don't even try to validate the TOD register, since we simply
 	 * can't write something sensible into that register.
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index f29e41c..999d715 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -73,8 +73,10 @@ extern void kernel_thread_starter(void);
  */
 void exit_thread(struct task_struct *tsk)
 {
-	if (tsk == current)
+	if (tsk == current) {
 		exit_thread_runtime_instr();
+		exit_thread_gs();
+	}
 }
 
 void flush_thread(void)
@@ -159,6 +161,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
 	/* Don't copy runtime instrumentation info */
 	p->thread.ri_cb = NULL;
 	frame->childregs.psw.mask &= ~PSW_MASK_RI;
+	/* Don't copy guarded storage control block */
+	p->thread.gs_cb = NULL;
+	p->thread.gs_bc_cb = NULL;
 
 	/* Set a new TLS ?  */
 	if (clone_flags & CLONE_SETTLS) {
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 928b929..c737098 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -95,7 +95,7 @@ static void show_cpu_summary(struct seq_file *m, void *v)
 {
 	static const char *hwcap_str[] = {
 		"esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp",
-		"edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe"
+		"edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs"
 	};
 	static const char * const int_hwcap_str[] = {
 		"sie"
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index c14df0a..c933e25 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -44,30 +44,42 @@ void update_cr_regs(struct task_struct *task)
 	struct pt_regs *regs = task_pt_regs(task);
 	struct thread_struct *thread = &task->thread;
 	struct per_regs old, new;
-
+	unsigned long cr0_old, cr0_new;
+	unsigned long cr2_old, cr2_new;
+	int cr0_changed, cr2_changed;
+
+	__ctl_store(cr0_old, 0, 0);
+	__ctl_store(cr2_old, 2, 2);
+	cr0_new = cr0_old;
+	cr2_new = cr2_old;
 	/* Take care of the enable/disable of transactional execution. */
 	if (MACHINE_HAS_TE) {
-		unsigned long cr, cr_new;
-
-		__ctl_store(cr, 0, 0);
 		/* Set or clear transaction execution TXC bit 8. */
-		cr_new = cr | (1UL << 55);
+		cr0_new |= (1UL << 55);
 		if (task->thread.per_flags & PER_FLAG_NO_TE)
-			cr_new &= ~(1UL << 55);
-		if (cr_new != cr)
-			__ctl_load(cr_new, 0, 0);
+			cr0_new &= ~(1UL << 55);
 		/* Set or clear transaction execution TDC bits 62 and 63. */
-		__ctl_store(cr, 2, 2);
-		cr_new = cr & ~3UL;
+		cr2_new &= ~3UL;
 		if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) {
 			if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND)
-				cr_new |= 1UL;
+				cr2_new |= 1UL;
 			else
-				cr_new |= 2UL;
+				cr2_new |= 2UL;
 		}
-		if (cr_new != cr)
-			__ctl_load(cr_new, 2, 2);
 	}
+	/* Take care of enable/disable of guarded storage. */
+	if (MACHINE_HAS_GS) {
+		cr2_new &= ~(1UL << 4);
+		if (task->thread.gs_cb)
+			cr2_new |= (1UL << 4);
+	}
+	/* Load control register 0/2 iff changed */
+	cr0_changed = cr0_new != cr0_old;
+	cr2_changed = cr2_new != cr2_old;
+	if (cr0_changed)
+		__ctl_load(cr0_new, 0, 0);
+	if (cr2_changed)
+		__ctl_load(cr2_new, 2, 2);
 	/* Copy user specified PER registers */
 	new.control = thread->per_user.control;
 	new.start = thread->per_user.start;
@@ -1137,6 +1149,36 @@ static int s390_system_call_set(struct task_struct *target,
 				  data, 0, sizeof(unsigned int));
 }
 
+static int s390_gs_cb_get(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  void *kbuf, void __user *ubuf)
+{
+	struct gs_cb *data = target->thread.gs_cb;
+
+	if (!MACHINE_HAS_GS)
+		return -ENODEV;
+	if (!data)
+		return -ENODATA;
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   data, 0, sizeof(struct gs_cb));
+}
+
+static int s390_gs_cb_set(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  const void *kbuf, const void __user *ubuf)
+{
+	struct gs_cb *data = target->thread.gs_cb;
+
+	if (!MACHINE_HAS_GS)
+		return -ENODEV;
+	if (!data)
+		return -ENODATA;
+	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  data, 0, sizeof(struct gs_cb));
+}
+
 static const struct user_regset s390_regsets[] = {
 	{
 		.core_note_type = NT_PRSTATUS,
@@ -1194,6 +1236,14 @@ static const struct user_regset s390_regsets[] = {
 		.get = s390_vxrs_high_get,
 		.set = s390_vxrs_high_set,
 	},
+	{
+		.core_note_type = NT_S390_GS_CB,
+		.n = sizeof(struct gs_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_gs_cb_get,
+		.set = s390_gs_cb_set,
+	},
 };
 
 static const struct user_regset_view user_s390_view = {
@@ -1422,6 +1472,14 @@ static const struct user_regset s390_compat_regsets[] = {
 		.get = s390_compat_regs_high_get,
 		.set = s390_compat_regs_high_set,
 	},
+	{
+		.core_note_type = NT_S390_GS_CB,
+		.n = sizeof(struct gs_cb) / sizeof(__u64),
+		.size = sizeof(__u64),
+		.align = sizeof(__u64),
+		.get = s390_gs_cb_get,
+		.set = s390_gs_cb_set,
+	},
 };
 
 static const struct user_regset_view user_s390_compat_view = {
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 911dc0b..3ae756c 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -339,9 +339,15 @@ static void __init setup_lowcore(void)
 	lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
 	       MAX_FACILITY_BIT/8);
-	if (MACHINE_HAS_VX)
-		lc->vector_save_area_addr =
-			(unsigned long) &lc->vector_save_area;
+	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+		unsigned long bits, size;
+
+		bits = MACHINE_HAS_GS ? 11 : 10;
+		size = 1UL << bits;
+		lc->mcesad = (__u64) memblock_virt_alloc(size, size);
+		if (MACHINE_HAS_GS)
+			lc->mcesad |= bits;
+	}
 	lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0];
 	lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
 	lc->async_enter_timer = S390_lowcore.async_enter_timer;
@@ -779,6 +785,12 @@ static int __init setup_hwcaps(void)
 			elf_hwcap |= HWCAP_S390_VXRS_BCD;
 	}
 
+	/*
+	 * Guarded storage support HWCAP_S390_GS is bit 12.
+	 */
+	if (MACHINE_HAS_GS)
+		elf_hwcap |= HWCAP_S390_GS;
+
 	get_cpu_id(&cpu_id);
 	add_device_randomness(&cpu_id, sizeof(cpu_id));
 	switch (cpu_id.machine) {
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 47a973b..286bcee8 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -51,6 +51,7 @@
 #include <asm/os_info.h>
 #include <asm/sigp.h>
 #include <asm/idle.h>
+#include <asm/nmi.h>
 #include "entry.h"
 
 enum {
@@ -78,6 +79,8 @@ struct pcpu {
 static u8 boot_core_type;
 static struct pcpu pcpu_devices[NR_CPUS];
 
+static struct kmem_cache *pcpu_mcesa_cache;
+
 unsigned int smp_cpu_mt_shift;
 EXPORT_SYMBOL(smp_cpu_mt_shift);
 
@@ -188,8 +191,10 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
 static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 {
 	unsigned long async_stack, panic_stack;
+	unsigned long mcesa_origin, mcesa_bits;
 	struct lowcore *lc;
 
+	mcesa_origin = mcesa_bits = 0;
 	if (pcpu != &pcpu_devices[0]) {
 		pcpu->lowcore =	(struct lowcore *)
 			__get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
@@ -197,20 +202,27 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 		panic_stack = __get_free_page(GFP_KERNEL);
 		if (!pcpu->lowcore || !panic_stack || !async_stack)
 			goto out;
+		if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+			mcesa_origin = (unsigned long)
+				kmem_cache_alloc(pcpu_mcesa_cache, GFP_KERNEL);
+			if (!mcesa_origin)
+				goto out;
+			mcesa_bits = MACHINE_HAS_GS ? 11 : 0;
+		}
 	} else {
 		async_stack = pcpu->lowcore->async_stack - ASYNC_FRAME_OFFSET;
 		panic_stack = pcpu->lowcore->panic_stack - PANIC_FRAME_OFFSET;
+		mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK;
+		mcesa_bits = pcpu->lowcore->mcesad & MCESA_LC_MASK;
 	}
 	lc = pcpu->lowcore;
 	memcpy(lc, &S390_lowcore, 512);
 	memset((char *) lc + 512, 0, sizeof(*lc) - 512);
 	lc->async_stack = async_stack + ASYNC_FRAME_OFFSET;
 	lc->panic_stack = panic_stack + PANIC_FRAME_OFFSET;
+	lc->mcesad = mcesa_origin | mcesa_bits;
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
-	if (MACHINE_HAS_VX)
-		lc->vector_save_area_addr =
-			(unsigned long) &lc->vector_save_area;
 	if (vdso_alloc_per_cpu(lc))
 		goto out;
 	lowcore_ptr[cpu] = lc;
@@ -218,6 +230,9 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	return 0;
 out:
 	if (pcpu != &pcpu_devices[0]) {
+		if (mcesa_origin)
+			kmem_cache_free(pcpu_mcesa_cache,
+					(void *) mcesa_origin);
 		free_page(panic_stack);
 		free_pages(async_stack, ASYNC_ORDER);
 		free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
@@ -229,11 +244,17 @@ out:
 
 static void pcpu_free_lowcore(struct pcpu *pcpu)
 {
+	unsigned long mcesa_origin;
+
 	pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
 	lowcore_ptr[pcpu - pcpu_devices] = NULL;
 	vdso_free_per_cpu(pcpu->lowcore);
 	if (pcpu == &pcpu_devices[0])
 		return;
+	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+		mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK;
+		kmem_cache_free(pcpu_mcesa_cache, (void *) mcesa_origin);
+	}
 	free_page(pcpu->lowcore->panic_stack-PANIC_FRAME_OFFSET);
 	free_pages(pcpu->lowcore->async_stack-ASYNC_FRAME_OFFSET, ASYNC_ORDER);
 	free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
@@ -550,9 +571,11 @@ int smp_store_status(int cpu)
 	if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS,
 			      pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
 		return -EIO;
-	if (!MACHINE_HAS_VX)
+	if (!MACHINE_HAS_VX && !MACHINE_HAS_GS)
 		return 0;
-	pa = __pa(pcpu->lowcore->vector_save_area_addr);
+	pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK);
+	if (MACHINE_HAS_GS)
+		pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK;
 	if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS,
 			      pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
 		return -EIO;
@@ -897,12 +920,22 @@ void __init smp_fill_possible_mask(void)
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	unsigned long size;
+
 	/* request the 0x1201 emergency signal external interrupt */
 	if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1201");
 	/* request the 0x1202 external call external interrupt */
 	if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1202");
+	/* create slab cache for the machine-check-extended-save-areas */
+	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
+		size = 1UL << (MACHINE_HAS_GS ? 11 : 10);
+		pcpu_mcesa_cache = kmem_cache_create("nmi_save_areas",
+						     size, size, 0, NULL);
+		if (!pcpu_mcesa_cache)
+			panic("Couldn't create nmi save area cache");
+	}
 }
 
 void __init smp_prepare_boot_cpu(void)
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 2659b5c..54fce7b 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -386,5 +386,5 @@ SYSCALL(sys_mlock2,compat_sys_mlock2)
 SYSCALL(sys_copy_file_range,compat_sys_copy_file_range) /* 375 */
 SYSCALL(sys_preadv2,compat_sys_preadv2)
 SYSCALL(sys_pwritev2,compat_sys_pwritev2)
-NI_SYSCALL
+SYSCALL(sys_s390_guarded_storage,compat_sys_s390_guarded_storage) /* 378 */
 SYSCALL(sys_statx,compat_sys_statx)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index d55c829..709aca9 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -262,7 +262,7 @@ struct aste {
 
 int ipte_lock_held(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.sie_block->eca & 1) {
+	if (vcpu->arch.sie_block->eca & ECA_SII) {
 		int rc;
 
 		read_lock(&vcpu->kvm->arch.sca_lock);
@@ -361,7 +361,7 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
 
 void ipte_lock(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.sie_block->eca & 1)
+	if (vcpu->arch.sie_block->eca & ECA_SII)
 		ipte_lock_siif(vcpu);
 	else
 		ipte_lock_simple(vcpu);
@@ -369,7 +369,7 @@ void ipte_lock(struct kvm_vcpu *vcpu)
 
 void ipte_unlock(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.sie_block->eca & 1)
+	if (vcpu->arch.sie_block->eca & ECA_SII)
 		ipte_unlock_siif(vcpu);
 	else
 		ipte_unlock_simple(vcpu);
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 59920f9..a4752bf 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -35,6 +35,7 @@ static const intercept_handler_t instruction_handlers[256] = {
 	[0xb6] = kvm_s390_handle_stctl,
 	[0xb7] = kvm_s390_handle_lctl,
 	[0xb9] = kvm_s390_handle_b9,
+	[0xe3] = kvm_s390_handle_e3,
 	[0xe5] = kvm_s390_handle_e5,
 	[0xeb] = kvm_s390_handle_eb,
 };
@@ -368,8 +369,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
 	trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
 				      vcpu->arch.sie_block->ipb);
 
-	if (vcpu->arch.sie_block->ipa == 0xb256 &&
-	    test_kvm_facility(vcpu->kvm, 74))
+	if (vcpu->arch.sie_block->ipa == 0xb256)
 		return handle_sthyi(vcpu);
 
 	if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
@@ -404,28 +404,31 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 		return -EOPNOTSUPP;
 
 	switch (vcpu->arch.sie_block->icptcode) {
-	case 0x10:
-	case 0x18:
+	case ICPT_EXTREQ:
+	case ICPT_IOREQ:
 		return handle_noop(vcpu);
-	case 0x04:
+	case ICPT_INST:
 		rc = handle_instruction(vcpu);
 		break;
-	case 0x08:
+	case ICPT_PROGI:
 		return handle_prog(vcpu);
-	case 0x14:
+	case ICPT_EXTINT:
 		return handle_external_interrupt(vcpu);
-	case 0x1c:
+	case ICPT_WAIT:
 		return kvm_s390_handle_wait(vcpu);
-	case 0x20:
+	case ICPT_VALIDITY:
 		return handle_validity(vcpu);
-	case 0x28:
+	case ICPT_STOP:
 		return handle_stop(vcpu);
-	case 0x2c:
+	case ICPT_OPEREXC:
 		rc = handle_operexc(vcpu);
 		break;
-	case 0x38:
+	case ICPT_PARTEXEC:
 		rc = handle_partial_execution(vcpu);
 		break;
+	case ICPT_KSS:
+		rc = kvm_s390_skey_check_enable(vcpu);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 0f8f141..caf15c8 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -410,6 +410,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
 				 struct kvm_s390_mchk_info *mchk)
 {
 	unsigned long ext_sa_addr;
+	unsigned long lc;
 	freg_t fprs[NUM_FPRS];
 	union mci mci;
 	int rc;
@@ -418,12 +419,34 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
 	/* take care of lazy register loading */
 	save_fpu_regs();
 	save_access_regs(vcpu->run->s.regs.acrs);
+	if (MACHINE_HAS_GS && vcpu->arch.gs_enabled)
+		save_gs_cb(current->thread.gs_cb);
 
 	/* Extended save area */
-	rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr,
-			    sizeof(unsigned long));
-	/* Only bits 0-53 are used for address formation */
-	ext_sa_addr &= ~0x3ffUL;
+	rc = read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr,
+			   sizeof(unsigned long));
+	/* Only bits 0 through 63-LC are used for address formation */
+	lc = ext_sa_addr & MCESA_LC_MASK;
+	if (test_kvm_facility(vcpu->kvm, 133)) {
+		switch (lc) {
+		case 0:
+		case 10:
+			ext_sa_addr &= ~0x3ffUL;
+			break;
+		case 11:
+			ext_sa_addr &= ~0x7ffUL;
+			break;
+		case 12:
+			ext_sa_addr &= ~0xfffUL;
+			break;
+		default:
+			ext_sa_addr = 0;
+			break;
+		}
+	} else {
+		ext_sa_addr &= ~0x3ffUL;
+	}
+
 	if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) {
 		if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs,
 				    512))
@@ -431,6 +454,14 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
 	} else {
 		mci.vr = 0;
 	}
+	if (!rc && mci.gs && ext_sa_addr && test_kvm_facility(vcpu->kvm, 133)
+	    && (lc == 11 || lc == 12)) {
+		if (write_guest_abs(vcpu, ext_sa_addr + 1024,
+				    &vcpu->run->s.regs.gscb, 32))
+			mci.gs = 0;
+	} else {
+		mci.gs = 0;
+	}
 
 	/* General interruption information */
 	rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID);
@@ -1968,6 +1999,8 @@ static int register_io_adapter(struct kvm_device *dev,
 	adapter->maskable = adapter_info.maskable;
 	adapter->masked = false;
 	adapter->swap = adapter_info.swap;
+	adapter->suppressible = (adapter_info.flags) &
+				KVM_S390_ADAPTER_SUPPRESSIBLE;
 	dev->kvm->arch.adapters[adapter->id] = adapter;
 
 	return 0;
@@ -2121,6 +2154,87 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr)
 	return 0;
 }
 
+static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_ais_req req;
+	int ret = 0;
+
+	if (!fi->ais_enabled)
+		return -ENOTSUPP;
+
+	if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
+		return -EFAULT;
+
+	if (req.isc > MAX_ISC)
+		return -EINVAL;
+
+	trace_kvm_s390_modify_ais_mode(req.isc,
+				       (fi->simm & AIS_MODE_MASK(req.isc)) ?
+				       (fi->nimm & AIS_MODE_MASK(req.isc)) ?
+				       2 : KVM_S390_AIS_MODE_SINGLE :
+				       KVM_S390_AIS_MODE_ALL, req.mode);
+
+	mutex_lock(&fi->ais_lock);
+	switch (req.mode) {
+	case KVM_S390_AIS_MODE_ALL:
+		fi->simm &= ~AIS_MODE_MASK(req.isc);
+		fi->nimm &= ~AIS_MODE_MASK(req.isc);
+		break;
+	case KVM_S390_AIS_MODE_SINGLE:
+		fi->simm |= AIS_MODE_MASK(req.isc);
+		fi->nimm &= ~AIS_MODE_MASK(req.isc);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	mutex_unlock(&fi->ais_lock);
+
+	return ret;
+}
+
+static int kvm_s390_inject_airq(struct kvm *kvm,
+				struct s390_io_adapter *adapter)
+{
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+	struct kvm_s390_interrupt s390int = {
+		.type = KVM_S390_INT_IO(1, 0, 0, 0),
+		.parm = 0,
+		.parm64 = (adapter->isc << 27) | 0x80000000,
+	};
+	int ret = 0;
+
+	if (!fi->ais_enabled || !adapter->suppressible)
+		return kvm_s390_inject_vm(kvm, &s390int);
+
+	mutex_lock(&fi->ais_lock);
+	if (fi->nimm & AIS_MODE_MASK(adapter->isc)) {
+		trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc);
+		goto out;
+	}
+
+	ret = kvm_s390_inject_vm(kvm, &s390int);
+	if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) {
+		fi->nimm |= AIS_MODE_MASK(adapter->isc);
+		trace_kvm_s390_modify_ais_mode(adapter->isc,
+					       KVM_S390_AIS_MODE_SINGLE, 2);
+	}
+out:
+	mutex_unlock(&fi->ais_lock);
+	return ret;
+}
+
+static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+	unsigned int id = attr->attr;
+	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+
+	if (!adapter)
+		return -EINVAL;
+
+	return kvm_s390_inject_airq(kvm, adapter);
+}
+
 static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 {
 	int r = 0;
@@ -2157,6 +2271,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	case KVM_DEV_FLIC_CLEAR_IO_IRQ:
 		r = clear_io_irq(dev->kvm, attr);
 		break;
+	case KVM_DEV_FLIC_AISM:
+		r = modify_ais_mode(dev->kvm, attr);
+		break;
+	case KVM_DEV_FLIC_AIRQ_INJECT:
+		r = flic_inject_airq(dev->kvm, attr);
+		break;
 	default:
 		r = -EINVAL;
 	}
@@ -2176,6 +2296,8 @@ static int flic_has_attr(struct kvm_device *dev,
 	case KVM_DEV_FLIC_ADAPTER_REGISTER:
 	case KVM_DEV_FLIC_ADAPTER_MODIFY:
 	case KVM_DEV_FLIC_CLEAR_IO_IRQ:
+	case KVM_DEV_FLIC_AISM:
+	case KVM_DEV_FLIC_AIRQ_INJECT:
 		return 0;
 	}
 	return -ENXIO;
@@ -2286,12 +2408,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
 	ret = adapter_indicators_set(kvm, adapter, &e->adapter);
 	up_read(&adapter->maps_lock);
 	if ((ret > 0) && !adapter->masked) {
-		struct kvm_s390_interrupt s390int = {
-			.type = KVM_S390_INT_IO(1, 0, 0, 0),
-			.parm = 0,
-			.parm64 = (adapter->isc << 27) | 0x80000000,
-		};
-		ret = kvm_s390_inject_vm(kvm, &s390int);
+		ret = kvm_s390_inject_airq(kvm, adapter);
 		if (ret == 0)
 			ret = 1;
 	}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fd6cd05b..4bafb0a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -276,6 +276,10 @@ static void kvm_s390_cpu_feat_init(void)
 		__cpacf_query(CPACF_PPNO, (cpacf_mask_t *)
 			      kvm_s390_available_subfunc.ppno);
 
+	if (test_facility(146)) /* MSA8 */
+		__cpacf_query(CPACF_KMA, (cpacf_mask_t *)
+			      kvm_s390_available_subfunc.kma);
+
 	if (MACHINE_HAS_ESOP)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
 	/*
@@ -300,6 +304,8 @@ static void kvm_s390_cpu_feat_init(void)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
 	if (sclp.has_ibs)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+	if (sclp.has_kss)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_KSS);
 	/*
 	 * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
 	 * all skey handling functions read/set the skey from the PGSTE
@@ -380,6 +386,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_SKEYS:
 	case KVM_CAP_S390_IRQ_STATE:
 	case KVM_CAP_S390_USER_INSTR0:
+	case KVM_CAP_S390_AIS:
 		r = 1;
 		break;
 	case KVM_CAP_S390_MEM_OP:
@@ -405,6 +412,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_RI:
 		r = test_facility(64);
 		break;
+	case KVM_CAP_S390_GS:
+		r = test_facility(133);
+		break;
 	default:
 		r = 0;
 	}
@@ -541,6 +551,34 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s",
 			 r ? "(not available)" : "(success)");
 		break;
+	case KVM_CAP_S390_AIS:
+		mutex_lock(&kvm->lock);
+		if (kvm->created_vcpus) {
+			r = -EBUSY;
+		} else {
+			set_kvm_facility(kvm->arch.model.fac_mask, 72);
+			set_kvm_facility(kvm->arch.model.fac_list, 72);
+			kvm->arch.float_int.ais_enabled = 1;
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		VM_EVENT(kvm, 3, "ENABLE: AIS %s",
+			 r ? "(not available)" : "(success)");
+		break;
+	case KVM_CAP_S390_GS:
+		r = -EINVAL;
+		mutex_lock(&kvm->lock);
+		if (atomic_read(&kvm->online_vcpus)) {
+			r = -EBUSY;
+		} else if (test_facility(133)) {
+			set_kvm_facility(kvm->arch.model.fac_mask, 133);
+			set_kvm_facility(kvm->arch.model.fac_list, 133);
+			r = 0;
+		}
+		mutex_unlock(&kvm->lock);
+		VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s",
+			 r ? "(not available)" : "(success)");
+		break;
 	case KVM_CAP_S390_USER_STSI:
 		VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI");
 		kvm->arch.user_stsi = 1;
@@ -1498,6 +1536,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	kvm_s390_crypto_init(kvm);
 
+	mutex_init(&kvm->arch.float_int.ais_lock);
+	kvm->arch.float_int.simm = 0;
+	kvm->arch.float_int.nimm = 0;
+	kvm->arch.float_int.ais_enabled = 0;
 	spin_lock_init(&kvm->arch.float_int.lock);
 	for (i = 0; i < FIRQ_LIST_COUNT; i++)
 		INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]);
@@ -1646,7 +1688,7 @@ static void sca_add_vcpu(struct kvm_vcpu *vcpu)
 		sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block;
 		vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32);
 		vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU;
-		vcpu->arch.sie_block->ecb2 |= 0x04U;
+		vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
 		set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
 	} else {
 		struct bsca_block *sca = vcpu->kvm->arch.sca;
@@ -1700,7 +1742,7 @@ static int sca_switch_to_extended(struct kvm *kvm)
 	kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) {
 		vcpu->arch.sie_block->scaoh = scaoh;
 		vcpu->arch.sie_block->scaol = scaol;
-		vcpu->arch.sie_block->ecb2 |= 0x04U;
+		vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
 	}
 	kvm->arch.sca = new_sca;
 	kvm->arch.use_esca = 1;
@@ -1749,6 +1791,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm_s390_set_prefix(vcpu, 0);
 	if (test_kvm_facility(vcpu->kvm, 64))
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
+	if (test_kvm_facility(vcpu->kvm, 133))
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
 	/* fprs can be synchronized via vrs, even if the guest has no vx. With
 	 * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
 	 */
@@ -1939,8 +1983,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
 	if (!vcpu->arch.sie_block->cbrlo)
 		return -ENOMEM;
 
-	vcpu->arch.sie_block->ecb2 |= 0x80;
-	vcpu->arch.sie_block->ecb2 &= ~0x08;
+	vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
+	vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI;
 	return 0;
 }
 
@@ -1970,31 +2014,37 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	/* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
 	if (MACHINE_HAS_ESOP)
-		vcpu->arch.sie_block->ecb |= 0x02;
+		vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
 	if (test_kvm_facility(vcpu->kvm, 9))
-		vcpu->arch.sie_block->ecb |= 0x04;
+		vcpu->arch.sie_block->ecb |= ECB_SRSI;
 	if (test_kvm_facility(vcpu->kvm, 73))
-		vcpu->arch.sie_block->ecb |= 0x10;
+		vcpu->arch.sie_block->ecb |= ECB_TE;
 
 	if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
-		vcpu->arch.sie_block->ecb2 |= 0x08;
+		vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
 	if (test_kvm_facility(vcpu->kvm, 130))
-		vcpu->arch.sie_block->ecb2 |= 0x20;
-	vcpu->arch.sie_block->eca = 0x1002000U;
+		vcpu->arch.sie_block->ecb2 |= ECB2_IEP;
+	vcpu->arch.sie_block->eca = ECA_MVPGI | ECA_PROTEXCI;
 	if (sclp.has_cei)
-		vcpu->arch.sie_block->eca |= 0x80000000U;
+		vcpu->arch.sie_block->eca |= ECA_CEI;
 	if (sclp.has_ib)
-		vcpu->arch.sie_block->eca |= 0x40000000U;
+		vcpu->arch.sie_block->eca |= ECA_IB;
 	if (sclp.has_siif)
-		vcpu->arch.sie_block->eca |= 1;
+		vcpu->arch.sie_block->eca |= ECA_SII;
 	if (sclp.has_sigpif)
-		vcpu->arch.sie_block->eca |= 0x10000000U;
+		vcpu->arch.sie_block->eca |= ECA_SIGPI;
 	if (test_kvm_facility(vcpu->kvm, 129)) {
-		vcpu->arch.sie_block->eca |= 0x00020000;
-		vcpu->arch.sie_block->ecd |= 0x20000000;
+		vcpu->arch.sie_block->eca |= ECA_VX;
+		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
 	}
+	vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
+					| SDNXC;
 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
-	vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+
+	if (sclp.has_kss)
+		atomic_or(CPUSTAT_KSS, &vcpu->arch.sie_block->cpuflags);
+	else
+		vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 
 	if (vcpu->kvm->arch.use_cmma) {
 		rc = kvm_s390_vcpu_setup_cmma(vcpu);
@@ -2446,7 +2496,7 @@ retry:
 	}
 
 	/* nothing to do, just clear the request */
-	clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
 	return 0;
 }
@@ -2719,6 +2769,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 
 static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	struct runtime_instr_cb *riccb;
+	struct gs_cb *gscb;
+
+	riccb = (struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
+	gscb = (struct gs_cb *) &kvm_run->s.regs.gscb;
 	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
 	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
@@ -2747,12 +2802,24 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	 * we should enable RI here instead of doing the lazy enablement.
 	 */
 	if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) &&
-	    test_kvm_facility(vcpu->kvm, 64)) {
-		struct runtime_instr_cb *riccb =
-			(struct runtime_instr_cb *) &kvm_run->s.regs.riccb;
-
-		if (riccb->valid)
-			vcpu->arch.sie_block->ecb3 |= 0x01;
+	    test_kvm_facility(vcpu->kvm, 64) &&
+	    riccb->valid &&
+	    !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) {
+		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)");
+		vcpu->arch.sie_block->ecb3 |= ECB3_RI;
+	}
+	/*
+	 * If userspace sets the gscb (e.g. after migration) to non-zero,
+	 * we should enable GS here instead of doing the lazy enablement.
+	 */
+	if ((kvm_run->kvm_dirty_regs & KVM_SYNC_GSCB) &&
+	    test_kvm_facility(vcpu->kvm, 133) &&
+	    gscb->gssm &&
+	    !vcpu->arch.gs_enabled) {
+		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (sync_regs)");
+		vcpu->arch.sie_block->ecb |= ECB_GS;
+		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
+		vcpu->arch.gs_enabled = 1;
 	}
 	save_access_regs(vcpu->arch.host_acrs);
 	restore_access_regs(vcpu->run->s.regs.acrs);
@@ -2768,6 +2835,20 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (test_fp_ctl(current->thread.fpu.fpc))
 		/* User space provided an invalid FPC, let's clear it */
 		current->thread.fpu.fpc = 0;
+	if (MACHINE_HAS_GS) {
+		preempt_disable();
+		__ctl_set_bit(2, 4);
+		if (current->thread.gs_cb) {
+			vcpu->arch.host_gscb = current->thread.gs_cb;
+			save_gs_cb(vcpu->arch.host_gscb);
+		}
+		if (vcpu->arch.gs_enabled) {
+			current->thread.gs_cb = (struct gs_cb *)
+						&vcpu->run->s.regs.gscb;
+			restore_gs_cb(current->thread.gs_cb);
+		}
+		preempt_enable();
+	}
 
 	kvm_run->kvm_dirty_regs = 0;
 }
@@ -2794,6 +2875,18 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	/* Restore will be done lazily at return */
 	current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
 	current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
+	if (MACHINE_HAS_GS) {
+		__ctl_set_bit(2, 4);
+		if (vcpu->arch.gs_enabled)
+			save_gs_cb(current->thread.gs_cb);
+		preempt_disable();
+		current->thread.gs_cb = vcpu->arch.host_gscb;
+		restore_gs_cb(vcpu->arch.host_gscb);
+		preempt_enable();
+		if (!vcpu->arch.host_gscb)
+			__ctl_clear_bit(2, 4);
+		vcpu->arch.host_gscb = NULL;
+	}
 
 }
 
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index af9fa91..55f5c84 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -25,7 +25,7 @@
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
 /* Transactional Memory Execution related macros */
-#define IS_TE_ENABLED(vcpu)	((vcpu->arch.sie_block->ecb & 0x10))
+#define IS_TE_ENABLED(vcpu)	((vcpu->arch.sie_block->ecb & ECB_TE))
 #define TDB_FORMAT1		1
 #define IS_ITDB_VALID(vcpu)	((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1))
 
@@ -246,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
 int is_valid_psw(psw_t *psw);
 int kvm_s390_handle_aa(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_e3(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
@@ -253,6 +254,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
+int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu);
 
 /* implemented in vsie.c */
 int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 64b6a30..c03106c 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -37,7 +37,8 @@
 static int handle_ri(struct kvm_vcpu *vcpu)
 {
 	if (test_kvm_facility(vcpu->kvm, 64)) {
-		vcpu->arch.sie_block->ecb3 |= 0x01;
+		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)");
+		vcpu->arch.sie_block->ecb3 |= ECB3_RI;
 		kvm_s390_retry_instr(vcpu);
 		return 0;
 	} else
@@ -52,6 +53,33 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
 		return -EOPNOTSUPP;
 }
 
+static int handle_gs(struct kvm_vcpu *vcpu)
+{
+	if (test_kvm_facility(vcpu->kvm, 133)) {
+		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)");
+		preempt_disable();
+		__ctl_set_bit(2, 4);
+		current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb;
+		restore_gs_cb(current->thread.gs_cb);
+		preempt_enable();
+		vcpu->arch.sie_block->ecb |= ECB_GS;
+		vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT;
+		vcpu->arch.gs_enabled = 1;
+		kvm_s390_retry_instr(vcpu);
+		return 0;
+	} else
+		return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
+int kvm_s390_handle_e3(struct kvm_vcpu *vcpu)
+{
+	int code = vcpu->arch.sie_block->ipb & 0xff;
+
+	if (code == 0x49 || code == 0x4d)
+		return handle_gs(vcpu);
+	else
+		return -EOPNOTSUPP;
+}
 /* Handle SCK (SET CLOCK) interception */
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
@@ -170,18 +198,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int __skey_check_enable(struct kvm_vcpu *vcpu)
+int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 {
 	int rc = 0;
+	struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
 
 	trace_kvm_s390_skey_related_inst(vcpu);
-	if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
+	if (!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) &&
+	    !(atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS))
 		return rc;
 
 	rc = s390_enable_skey();
 	VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
-	if (!rc)
-		vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+	if (!rc) {
+		if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)
+			atomic_andnot(CPUSTAT_KSS, &sie_block->cpuflags);
+		else
+			sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE |
+					     ICTL_RRBE);
+	}
 	return rc;
 }
 
@@ -190,7 +225,7 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
 	int rc;
 
 	vcpu->stat.instruction_storage_key++;
-	rc = __skey_check_enable(vcpu);
+	rc = kvm_s390_skey_check_enable(vcpu);
 	if (rc)
 		return rc;
 	if (sclp.has_skey) {
@@ -759,6 +794,7 @@ static const intercept_handler_t b2_handlers[256] = {
 	[0x3b] = handle_io_inst,
 	[0x3c] = handle_io_inst,
 	[0x50] = handle_ipte_interlock,
+	[0x56] = handle_sthyi,
 	[0x5f] = handle_io_inst,
 	[0x74] = handle_io_inst,
 	[0x76] = handle_io_inst,
@@ -887,7 +923,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 		}
 
 		if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-			int rc = __skey_check_enable(vcpu);
+			int rc = kvm_s390_skey_check_enable(vcpu);
 
 			if (rc)
 				return rc;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
index 05c98bb..926b524 100644
--- a/arch/s390/kvm/sthyi.c
+++ b/arch/s390/kvm/sthyi.c
@@ -404,6 +404,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
 	u64 code, addr, cc = 0;
 	struct sthyi_sctns *sctns = NULL;
 
+	if (!test_kvm_facility(vcpu->kvm, 74))
+		return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+
 	/*
 	 * STHYI requires extensive locking in the higher hypervisors
 	 * and is very computational/memory expensive. Therefore we
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 396485b..78b7e84 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -280,6 +280,58 @@ TRACE_EVENT(kvm_s390_enable_disable_ibs,
 		      __entry->state ? "enabling" : "disabling", __entry->id)
 	);
 
+/*
+ * Trace point for modifying ais mode for a given isc.
+ */
+TRACE_EVENT(kvm_s390_modify_ais_mode,
+	    TP_PROTO(__u8 isc, __u16 from, __u16 to),
+	    TP_ARGS(isc, from, to),
+
+	    TP_STRUCT__entry(
+		    __field(__u8, isc)
+		    __field(__u16, from)
+		    __field(__u16, to)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->isc = isc;
+		    __entry->from = from;
+		    __entry->to = to;
+		    ),
+
+	    TP_printk("for isc %x, modifying interruption mode from %s to %s",
+		      __entry->isc,
+		      (__entry->from == KVM_S390_AIS_MODE_ALL) ?
+		      "ALL-Interruptions Mode" :
+		      (__entry->from == KVM_S390_AIS_MODE_SINGLE) ?
+		      "Single-Interruption Mode" : "No-Interruptions Mode",
+		      (__entry->to == KVM_S390_AIS_MODE_ALL) ?
+		      "ALL-Interruptions Mode" :
+		      (__entry->to == KVM_S390_AIS_MODE_SINGLE) ?
+		      "Single-Interruption Mode" : "No-Interruptions Mode")
+	);
+
+/*
+ * Trace point for suppressed adapter I/O interrupt.
+ */
+TRACE_EVENT(kvm_s390_airq_suppressed,
+	    TP_PROTO(__u32 id, __u8 isc),
+	    TP_ARGS(id, isc),
+
+	    TP_STRUCT__entry(
+		    __field(__u32, id)
+		    __field(__u8, isc)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->id = id;
+		    __entry->isc = isc;
+		    ),
+
+	    TP_printk("adapter I/O interrupt suppressed (id:%x isc:%x)",
+		      __entry->id, __entry->isc)
+	);
+
 
 #endif /* _TRACE_KVMS390_H */
 
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 5491be3..4719ecb 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -117,6 +117,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		newflags |= cpuflags & CPUSTAT_SM;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
 		newflags |= cpuflags & CPUSTAT_IBS;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_KSS))
+		newflags |= cpuflags & CPUSTAT_KSS;
 
 	atomic_set(&scb_s->cpuflags, newflags);
 	return 0;
@@ -249,7 +251,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-	bool had_tx = scb_s->ecb & 0x10U;
+	bool had_tx = scb_s->ecb & ECB_TE;
 	unsigned long new_mso = 0;
 	int rc;
 
@@ -289,7 +291,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	 * bits. Therefore we cannot provide interpretation and would later
 	 * have to provide own emulation handlers.
 	 */
-	scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_KSS))
+		scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+
 	scb_s->icpua = scb_o->icpua;
 
 	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
@@ -307,34 +311,39 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->ihcpu = scb_o->ihcpu;
 
 	/* MVPG and Protection Exception Interpretation are always available */
-	scb_s->eca |= scb_o->eca & 0x01002000U;
+	scb_s->eca |= scb_o->eca & (ECA_MVPGI | ECA_PROTEXCI);
 	/* Host-protection-interruption introduced with ESOP */
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
-		scb_s->ecb |= scb_o->ecb & 0x02U;
+		scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
 	/* transactional execution */
 	if (test_kvm_facility(vcpu->kvm, 73)) {
 		/* remap the prefix is tx is toggled on */
-		if ((scb_o->ecb & 0x10U) && !had_tx)
+		if ((scb_o->ecb & ECB_TE) && !had_tx)
 			prefix_unmapped(vsie_page);
-		scb_s->ecb |= scb_o->ecb & 0x10U;
+		scb_s->ecb |= scb_o->ecb & ECB_TE;
 	}
 	/* SIMD */
 	if (test_kvm_facility(vcpu->kvm, 129)) {
-		scb_s->eca |= scb_o->eca & 0x00020000U;
-		scb_s->ecd |= scb_o->ecd & 0x20000000U;
+		scb_s->eca |= scb_o->eca & ECA_VX;
+		scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT;
 	}
 	/* Run-time-Instrumentation */
 	if (test_kvm_facility(vcpu->kvm, 64))
-		scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+		scb_s->ecb3 |= scb_o->ecb3 & ECB3_RI;
 	/* Instruction Execution Prevention */
 	if (test_kvm_facility(vcpu->kvm, 130))
-		scb_s->ecb2 |= scb_o->ecb2 & 0x20U;
+		scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP;
+	/* Guarded Storage */
+	if (test_kvm_facility(vcpu->kvm, 133)) {
+		scb_s->ecb |= scb_o->ecb & ECB_GS;
+		scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT;
+	}
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
-		scb_s->eca |= scb_o->eca & 0x00000001U;
+		scb_s->eca |= scb_o->eca & ECA_SII;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
-		scb_s->eca |= scb_o->eca & 0x40000000U;
+		scb_s->eca |= scb_o->eca & ECA_IB;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
-		scb_s->eca |= scb_o->eca & 0x80000000U;
+		scb_s->eca |= scb_o->eca & ECA_CEI;
 
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
@@ -406,7 +415,7 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	prefix += scb_s->mso;
 
 	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
-	if (!rc && (scb_s->ecb & 0x10U))
+	if (!rc && (scb_s->ecb & ECB_TE))
 		rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
 					   prefix + PAGE_SIZE);
 	/*
@@ -496,6 +505,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		unpin_guest_page(vcpu->kvm, gpa, hpa);
 		scb_s->riccbd = 0;
 	}
+
+	hpa = scb_s->sdnxo;
+	if (hpa) {
+		gpa = scb_o->sdnxo;
+		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		scb_s->sdnxo = 0;
+	}
 }
 
 /*
@@ -543,7 +559,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	}
 
 	gpa = scb_o->itdba & ~0xffUL;
-	if (gpa && (scb_s->ecb & 0x10U)) {
+	if (gpa && (scb_s->ecb & ECB_TE)) {
 		if (!(gpa & ~0x1fffU)) {
 			rc = set_validity_icpt(scb_s, 0x0080U);
 			goto unpin;
@@ -558,8 +574,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	}
 
 	gpa = scb_o->gvrd & ~0x1ffUL;
-	if (gpa && (scb_s->eca & 0x00020000U) &&
-	    !(scb_s->ecd & 0x20000000U)) {
+	if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
 		if (!(gpa & ~0x1fffUL)) {
 			rc = set_validity_icpt(scb_s, 0x1310U);
 			goto unpin;
@@ -577,7 +592,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	}
 
 	gpa = scb_o->riccbd & ~0x3fUL;
-	if (gpa && (scb_s->ecb3 & 0x01U)) {
+	if (gpa && (scb_s->ecb3 & ECB3_RI)) {
 		if (!(gpa & ~0x1fffUL)) {
 			rc = set_validity_icpt(scb_s, 0x0043U);
 			goto unpin;
@@ -591,6 +606,33 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			goto unpin;
 		scb_s->riccbd = hpa;
 	}
+	if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
+		unsigned long sdnxc;
+
+		gpa = scb_o->sdnxo & ~0xfUL;
+		sdnxc = scb_o->sdnxo & 0xfUL;
+		if (!gpa || !(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x10b0U);
+			goto unpin;
+		}
+		if (sdnxc < 6 || sdnxc > 12) {
+			rc = set_validity_icpt(scb_s, 0x10b1U);
+			goto unpin;
+		}
+		if (gpa & ((1 << sdnxc) - 1)) {
+			rc = set_validity_icpt(scb_s, 0x10b2U);
+			goto unpin;
+		}
+		/* Due to alignment rules (checked above) this cannot
+		 * cross page boundaries
+		 */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x10b0U);
+		if (rc)
+			goto unpin;
+		scb_s->sdnxo = hpa | sdnxc;
+	}
 	return 0;
 unpin:
 	unpin_blocks(vcpu, vsie_page);
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 0cf802d..be63fbd 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -82,6 +82,7 @@ static struct facility_def facility_defs[] = {
 			78, /* enhanced-DAT 2 */
 			130, /* instruction-execution-protection */
 			131, /* enhanced-SOP 2 and side-effect */
+			146, /* msa extension 8 */
 			-1  /* END */
 		}
 	},
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index de5d572..cd1fa97 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -302,8 +302,8 @@ extern int ignore_sigio_fd(int fd);
 extern void maybe_sigio_broken(int fd, int read);
 extern void sigio_broken(int fd, int read);
 
-/* sys-x86_64/prctl.c */
-extern int os_arch_prctl(int pid, int code, unsigned long *addr);
+/* prctl.c */
+extern int os_arch_prctl(int pid, int option, unsigned long *arg2);
 
 /* tty.c */
 extern int get_pty(void);
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9ba050f..0af59fa 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -390,3 +390,4 @@
 381	i386	pkey_alloc		sys_pkey_alloc
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
+384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b04bb6d..0fe0044 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -187,6 +187,7 @@
  * Reuse free bits when adding new feature flags!
  */
 #define X86_FEATURE_RING3MWAIT	( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
 #define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
 #define X86_FEATURE_CAT_L3	( 7*32+ 4) /* Cache Allocation Technology L3 */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 3e8c287..0559626 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -221,6 +221,9 @@ struct x86_emulate_ops {
 	void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
 			  u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 	void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
+
+	unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
+	void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
 };
 
 typedef u32 __attribute__((vector_size(16))) sse128_t;
@@ -290,7 +293,6 @@ struct x86_emulate_ctxt {
 
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
-	int emul_flags;
 
 	bool perm_ok; /* do not check permissions if true */
 	bool ud;	/* inject an #UD if host doesn't support insn */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 74ef58c..84c8489 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -43,8 +43,6 @@
 #define KVM_PRIVATE_MEM_SLOTS 3
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
-#define KVM_PIO_PAGE_OFFSET 1
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 #define KVM_HALT_POLL_NS_DEFAULT 400000
 
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
@@ -63,10 +61,10 @@
 #define KVM_REQ_PMI               19
 #define KVM_REQ_SMI               20
 #define KVM_REQ_MASTERCLOCK_UPDATE 21
-#define KVM_REQ_MCLOCK_INPROGRESS 22
-#define KVM_REQ_SCAN_IOAPIC       23
+#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SCAN_IOAPIC       (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
-#define KVM_REQ_APIC_PAGE_RELOAD  25
+#define KVM_REQ_APIC_PAGE_RELOAD  (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_HV_CRASH          26
 #define KVM_REQ_IOAPIC_EOI_EXIT   27
 #define KVM_REQ_HV_RESET          28
@@ -343,9 +341,10 @@ struct kvm_mmu {
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   u64 *spte, const void *pte);
 	hpa_t root_hpa;
-	int root_level;
-	int shadow_root_level;
 	union kvm_mmu_page_role base_role;
+	u8 root_level;
+	u8 shadow_root_level;
+	u8 ept_ad;
 	bool direct_map;
 
 	/*
@@ -612,6 +611,8 @@ struct kvm_vcpu_arch {
 	unsigned long dr7;
 	unsigned long eff_db[KVM_NR_DB_REGS];
 	unsigned long guest_debug_dr7;
+	u64 msr_platform_info;
+	u64 msr_misc_features_enables;
 
 	u64 mcg_cap;
 	u64 mcg_status;
@@ -727,6 +728,7 @@ struct kvm_hv {
 
 enum kvm_irqchip_mode {
 	KVM_IRQCHIP_NONE,
+	KVM_IRQCHIP_INIT_IN_PROGRESS, /* temporarily set during creation */
 	KVM_IRQCHIP_KERNEL,       /* created with KVM_CREATE_IRQCHIP */
 	KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
 };
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index d74747b..c4eda79 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -46,6 +46,7 @@ struct kvm_page_track_notifier_node {
 };
 
 void kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
 				 struct kvm_memory_slot *dont);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8b5f8a..673f9ac 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -45,6 +45,8 @@
 #define MSR_IA32_PERFCTR1		0x000000c2
 #define MSR_FSB_FREQ			0x000000cd
 #define MSR_PLATFORM_INFO		0x000000ce
+#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT	31
+#define MSR_PLATFORM_INFO_CPUID_FAULT		BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
 
 #define MSR_PKG_CST_CONFIG_CONTROL	0x000000e2
 #define NHM_C3_AUTO_DEMOTE		(1UL << 25)
@@ -127,6 +129,7 @@
 
 /* DEBUGCTLMSR bits (others vary by model): */
 #define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF_SHIFT		1
 #define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
 #define DEBUGCTLMSR_TR			(1UL <<  6)
 #define DEBUGCTLMSR_BTS			(1UL <<  7)
@@ -552,10 +555,12 @@
 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT	39
 #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE		(1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
 
-/* MISC_FEATURE_ENABLES non-architectural features */
-#define MSR_MISC_FEATURE_ENABLES	0x00000140
+/* MISC_FEATURES_ENABLES non-architectural features */
+#define MSR_MISC_FEATURES_ENABLES	0x00000140
 
-#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT		1
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT	0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT		BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
+#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT	1
 
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca..a80c1b3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -884,6 +884,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
 extern int get_tsc_mode(unsigned long adr);
 extern int set_tsc_mode(unsigned int val);
 
+DECLARE_PER_CPU(u64, msr_misc_features_shadow);
+
 /* Register/unregister a process' MPX related resource */
 #define MPX_ENABLE_MANAGEMENT()	mpx_enable_management()
 #define MPX_DISABLE_MANAGEMENT()	mpx_disable_management()
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 9b9b30b1..8d3964f 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -9,6 +9,7 @@ void syscall_init(void);
 
 #ifdef CONFIG_X86_64
 void entry_SYSCALL_64(void);
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
 #endif
 
 #ifdef CONFIG_X86_32
@@ -30,6 +31,7 @@ void x86_report_nx(void);
 
 extern int reboot_force;
 
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
+long do_arch_prctl_common(struct task_struct *task, int option,
+			  unsigned long cpuid_enabled);
 
 #endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad6f5eb0..9fc44b9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,6 +87,7 @@ struct thread_info {
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
+#define TIF_NOCPUID		15	/* CPUID is not accessible in userland */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* IA32 compatibility process */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
@@ -110,6 +111,7 @@ struct thread_info {
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_NOCPUID		(1 << TIF_NOCPUID)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
@@ -138,7 +140,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
+	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
@@ -239,6 +241,8 @@ static inline int arch_within_stack_frames(const void * const stack,
 extern void arch_task_cache_init(void);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 extern void arch_release_task_struct(struct task_struct *tsk);
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fc5abff..75d002b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask)
 	}
 }
 
+static inline void cr4_toggle_bits(unsigned long mask)
+{
+	unsigned long cr4;
+
+	cr4 = this_cpu_read(cpu_tlbstate.cr4);
+	cr4 ^= mask;
+	this_cpu_write(cpu_tlbstate.cr4, cr4);
+	__write_cr4(cr4);
+}
+
 /* Read the CR4 shadow. */
 static inline unsigned long cr4_read_shadow(void)
 {
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index cc54b70..35cd06f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,8 +70,10 @@
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
+#define SECONDARY_EXEC_RDRAND			0x00000800
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
+#define SECONDARY_EXEC_RDSEED			0x00010000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES			0x00100000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
@@ -516,12 +518,14 @@ struct vmx_msr_entry {
 #define EPT_VIOLATION_READABLE_BIT	3
 #define EPT_VIOLATION_WRITABLE_BIT	4
 #define EPT_VIOLATION_EXECUTABLE_BIT	5
+#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
 #define EPT_VIOLATION_ACC_READ		(1 << EPT_VIOLATION_ACC_READ_BIT)
 #define EPT_VIOLATION_ACC_WRITE		(1 << EPT_VIOLATION_ACC_WRITE_BIT)
 #define EPT_VIOLATION_ACC_INSTR		(1 << EPT_VIOLATION_ACC_INSTR_BIT)
 #define EPT_VIOLATION_READABLE		(1 << EPT_VIOLATION_READABLE_BIT)
 #define EPT_VIOLATION_WRITABLE		(1 << EPT_VIOLATION_WRITABLE_BIT)
 #define EPT_VIOLATION_EXECUTABLE	(1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
 /*
  * VM-instruction error numbers
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 739c0c5..c2824d0 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -9,6 +9,9 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
 #define DE_VECTOR 0
 #define DB_VECTOR 1
 #define BP_VECTOR 3
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 835aa51..c457655 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -1,10 +1,13 @@
 #ifndef _ASM_X86_PRCTL_H
 #define _ASM_X86_PRCTL_H
 
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS		0x1001
+#define ARCH_SET_FS		0x1002
+#define ARCH_GET_FS		0x1003
+#define ARCH_GET_GS		0x1004
+
+#define ARCH_GET_CPUID		0x1011
+#define ARCH_SET_CPUID		0x1012
 
 #define ARCH_MAP_VDSO_X32	0x2001
 #define ARCH_MAP_VDSO_32	0x2002
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 1445865..690a2dc 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -76,7 +76,11 @@
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
 #define EXIT_REASON_APIC_WRITE          56
+#define EXIT_REASON_RDRAND              57
 #define EXIT_REASON_INVPCID             58
+#define EXIT_REASON_VMFUNC              59
+#define EXIT_REASON_ENCLS               60
+#define EXIT_REASON_RDSEED              61
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
@@ -90,6 +94,7 @@
 	{ EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
 	{ EXIT_REASON_CPUID,                 "CPUID" }, \
 	{ EXIT_REASON_HLT,                   "HLT" }, \
+	{ EXIT_REASON_INVD,                  "INVD" }, \
 	{ EXIT_REASON_INVLPG,                "INVLPG" }, \
 	{ EXIT_REASON_RDPMC,                 "RDPMC" }, \
 	{ EXIT_REASON_RDTSC,                 "RDTSC" }, \
@@ -108,6 +113,8 @@
 	{ EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
 	{ EXIT_REASON_MSR_READ,              "MSR_READ" }, \
 	{ EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
+	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
+	{ EXIT_REASON_MSR_LOAD_FAIL,         "MSR_LOAD_FAIL" }, \
 	{ EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
 	{ EXIT_REASON_MONITOR_TRAP_FLAG,     "MONITOR_TRAP_FLAG" }, \
 	{ EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
@@ -115,20 +122,24 @@
 	{ EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
 	{ EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
 	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
-	{ EXIT_REASON_GDTR_IDTR,	     "GDTR_IDTR" }, \
-	{ EXIT_REASON_LDTR_TR,		     "LDTR_TR" }, \
+	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
+	{ EXIT_REASON_GDTR_IDTR,             "GDTR_IDTR" }, \
+	{ EXIT_REASON_LDTR_TR,               "LDTR_TR" }, \
 	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
 	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
 	{ EXIT_REASON_INVEPT,                "INVEPT" }, \
+	{ EXIT_REASON_RDTSCP,                "RDTSCP" }, \
 	{ EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
+	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_WBINVD,                "WBINVD" }, \
+	{ EXIT_REASON_XSETBV,                "XSETBV" }, \
 	{ EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
-	{ EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
-	{ EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
-	{ EXIT_REASON_MSR_LOAD_FAIL,         "MSR_LOAD_FAIL" }, \
-	{ EXIT_REASON_INVD,                  "INVD" }, \
-	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
+	{ EXIT_REASON_RDRAND,                "RDRAND" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
+	{ EXIT_REASON_VMFUNC,                "VMFUNC" }, \
+	{ EXIT_REASON_ENCLS,                 "ENCLS" }, \
+	{ EXIT_REASON_RDSEED,                "RDSEED" }, \
+	{ EXIT_REASON_PML_FULL,              "PML_FULL" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
 	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0631977..dfa90a3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	if (ring3mwait_disabled) {
-		msr_clear_bit(MSR_MISC_FEATURE_ENABLES,
-			      MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+	if (ring3mwait_disabled)
 		return;
-	}
-
-	msr_set_bit(MSR_MISC_FEATURE_ENABLES,
-		    MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
 
 	set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+	this_cpu_or(msr_misc_features_shadow,
+		    1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
 
 	if (c == &boot_cpu_data)
 		ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
@@ -488,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c)
 	init_intel_energy_perf(c);
 }
 
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
+{
+	u64 msr;
+
+	if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
+		if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+			set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+	}
+}
+
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
+{
+	u64 msr;
+
+	if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+		return;
+
+	/* Clear all MISC features */
+	this_cpu_write(msr_misc_features_shadow, 0);
+
+	/* Check features and update capabilities and shadow control bits */
+	init_cpuid_fault(c);
+	probe_xeon_phi_r3mwait(c);
+
+	msr = this_cpu_read(msr_misc_features_shadow);
+	wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -602,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 
 	init_intel_energy_perf(c);
 
-	probe_xeon_phi_r3mwait(c);
+	init_intel_misc_features(c);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 14f65a5..da5c097 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -396,9 +396,9 @@ static u64 kvm_steal_clock(int cpu)
 	src = &per_cpu(steal_time, cpu);
 	do {
 		version = src->version;
-		rmb();
+		virt_rmb();
 		steal = src->steal;
-		rmb();
+		virt_rmb();
 	} while ((version & 1) || (version != src->version));
 
 	return steal;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f675915..0bb8842 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,6 +37,7 @@
 #include <asm/vm86.h>
 #include <asm/switch_to.h>
 #include <asm/desc.h>
+#include <asm/prctl.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -124,11 +125,6 @@ void flush_thread(void)
 	fpu__clear(&tsk->thread.fpu);
 }
 
-static void hard_disable_TSC(void)
-{
-	cr4_set_bits(X86_CR4_TSD);
-}
-
 void disable_TSC(void)
 {
 	preempt_disable();
@@ -137,15 +133,10 @@ void disable_TSC(void)
 		 * Must flip the CPU state synchronously with
 		 * TIF_NOTSC in the current running context.
 		 */
-		hard_disable_TSC();
+		cr4_set_bits(X86_CR4_TSD);
 	preempt_enable();
 }
 
-static void hard_enable_TSC(void)
-{
-	cr4_clear_bits(X86_CR4_TSD);
-}
-
 static void enable_TSC(void)
 {
 	preempt_disable();
@@ -154,7 +145,7 @@ static void enable_TSC(void)
 		 * Must flip the CPU state synchronously with
 		 * TIF_NOTSC in the current running context.
 		 */
-		hard_enable_TSC();
+		cr4_clear_bits(X86_CR4_TSD);
 	preempt_enable();
 }
 
@@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }
 
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
-		      struct tss_struct *tss)
-{
-	struct thread_struct *prev, *next;
-
-	prev = &prev_p->thread;
-	next = &next_p->thread;
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
 
-	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
-	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
-		unsigned long debugctl = get_debugctlmsr();
+static void set_cpuid_faulting(bool on)
+{
+	u64 msrval;
 
-		debugctl &= ~DEBUGCTLMSR_BTF;
-		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
-			debugctl |= DEBUGCTLMSR_BTF;
+	msrval = this_cpu_read(msr_misc_features_shadow);
+	msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+	msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+	this_cpu_write(msr_misc_features_shadow, msrval);
+	wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+}
 
-		update_debugctlmsr(debugctl);
+static void disable_cpuid(void)
+{
+	preempt_disable();
+	if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOCPUID in the current running context.
+		 */
+		set_cpuid_faulting(true);
 	}
+	preempt_enable();
+}
 
-	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
-	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
-		/* prev and next are different */
-		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
-			hard_disable_TSC();
-		else
-			hard_enable_TSC();
+static void enable_cpuid(void)
+{
+	preempt_disable();
+	if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOCPUID in the current running context.
+		 */
+		set_cpuid_faulting(false);
 	}
+	preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+	return !test_thread_flag(TIF_NOCPUID);
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+{
+	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+		return -ENODEV;
+
+	if (cpuid_enabled)
+		enable_cpuid();
+	else
+		disable_cpuid();
+
+	return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+	/* If cpuid was previously disabled for this task, re-enable it. */
+	if (test_thread_flag(TIF_NOCPUID))
+		enable_cpuid();
+}
 
-	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+static inline void switch_to_bitmap(struct tss_struct *tss,
+				    struct thread_struct *prev,
+				    struct thread_struct *next,
+				    unsigned long tifp, unsigned long tifn)
+{
+	if (tifn & _TIF_IO_BITMAP) {
 		/*
 		 * Copy the relevant range of the IO bitmap.
 		 * Normally this is 128 bytes or less:
 		 */
 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
 		       max(prev->io_bitmap_max, next->io_bitmap_max));
-
 		/*
 		 * Make sure that the TSS limit is correct for the CPU
 		 * to notice the IO bitmap.
 		 */
 		refresh_tss_limit();
-	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+	} else if (tifp & _TIF_IO_BITMAP) {
 		/*
 		 * Clear any possible leftover bits:
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+		      struct tss_struct *tss)
+{
+	struct thread_struct *prev, *next;
+	unsigned long tifp, tifn;
+
+	prev = &prev_p->thread;
+	next = &next_p->thread;
+
+	tifn = READ_ONCE(task_thread_info(next_p)->flags);
+	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+	switch_to_bitmap(tss, prev, next, tifp, tifn);
+
 	propagate_user_return_notify(prev_p, next_p);
+
+	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+	    arch_has_block_step()) {
+		unsigned long debugctl, msk;
+
+		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		msk = tifn & _TIF_BLOCKSTEP;
+		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	}
+
+	if ((tifp ^ tifn) & _TIF_NOTSC)
+		cr4_toggle_bits(X86_CR4_TSD);
+
+	if ((tifp ^ tifn) & _TIF_NOCPUID)
+		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
 }
 
 /*
@@ -550,3 +616,16 @@ out:
 	put_task_stack(p);
 	return ret;
 }
+
+long do_arch_prctl_common(struct task_struct *task, int option,
+			  unsigned long cpuid_enabled)
+{
+	switch (option) {
+	case ARCH_GET_CPUID:
+		return get_cpuid_mode();
+	case ARCH_SET_CPUID:
+		return set_cpuid_mode(task, cpuid_enabled);
+	}
+
+	return -EINVAL;
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4c818f8..ff40e74 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/ldt.h>
@@ -56,6 +57,7 @@
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
 #include <asm/intel_rdt.h>
+#include <asm/proto.h>
 
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	return prev_p;
 }
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+	return do_arch_prctl_common(current, option, arg2);
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d6b784a..ea1a618 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
+#include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -204,7 +205,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 				(struct user_desc __user *)tls, 0);
 		else
 #endif
-			err = do_arch_prctl(p, ARCH_SET_FS, tls);
+			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
 		if (err)
 			goto out;
 	}
@@ -547,70 +548,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 }
 #endif
 
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 {
 	int ret = 0;
 	int doit = task == current;
 	int cpu;
 
-	switch (code) {
+	switch (option) {
 	case ARCH_SET_GS:
-		if (addr >= TASK_SIZE_MAX)
+		if (arg2 >= TASK_SIZE_MAX)
 			return -EPERM;
 		cpu = get_cpu();
 		task->thread.gsindex = 0;
-		task->thread.gsbase = addr;
+		task->thread.gsbase = arg2;
 		if (doit) {
 			load_gs_index(0);
-			ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
+			ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
 		}
 		put_cpu();
 		break;
 	case ARCH_SET_FS:
 		/* Not strictly needed for fs, but do it for symmetry
 		   with gs */
-		if (addr >= TASK_SIZE_MAX)
+		if (arg2 >= TASK_SIZE_MAX)
 			return -EPERM;
 		cpu = get_cpu();
 		task->thread.fsindex = 0;
-		task->thread.fsbase = addr;
+		task->thread.fsbase = arg2;
 		if (doit) {
 			/* set the selector to 0 to not confuse __switch_to */
 			loadsegment(fs, 0);
-			ret = wrmsrl_safe(MSR_FS_BASE, addr);
+			ret = wrmsrl_safe(MSR_FS_BASE, arg2);
 		}
 		put_cpu();
 		break;
 	case ARCH_GET_FS: {
 		unsigned long base;
+
 		if (doit)
 			rdmsrl(MSR_FS_BASE, base);
 		else
 			base = task->thread.fsbase;
-		ret = put_user(base, (unsigned long __user *)addr);
+		ret = put_user(base, (unsigned long __user *)arg2);
 		break;
 	}
 	case ARCH_GET_GS: {
 		unsigned long base;
+
 		if (doit)
 			rdmsrl(MSR_KERNEL_GS_BASE, base);
 		else
 			base = task->thread.gsbase;
-		ret = put_user(base, (unsigned long __user *)addr);
+		ret = put_user(base, (unsigned long __user *)arg2);
 		break;
 	}
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
 # ifdef CONFIG_X86_X32_ABI
 	case ARCH_MAP_VDSO_X32:
-		return prctl_map_vdso(&vdso_image_x32, addr);
+		return prctl_map_vdso(&vdso_image_x32, arg2);
 # endif
 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 	case ARCH_MAP_VDSO_32:
-		return prctl_map_vdso(&vdso_image_32, addr);
+		return prctl_map_vdso(&vdso_image_32, arg2);
 # endif
 	case ARCH_MAP_VDSO_64:
-		return prctl_map_vdso(&vdso_image_64, addr);
+		return prctl_map_vdso(&vdso_image_64, arg2);
 #endif
 
 	default:
@@ -621,10 +624,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 	return ret;
 }
 
-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+	long ret;
+
+	ret = do_arch_prctl_64(current, option, arg2);
+	if (ret == -EINVAL)
+		ret = do_arch_prctl_common(current, option, arg2);
+
+	return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 {
-	return do_arch_prctl(current, code, addr);
+	return do_arch_prctl_common(current, option, arg2);
 }
+#endif
 
 unsigned long KSTK_ESP(struct task_struct *task)
 {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2364b23..f37d181 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -396,12 +396,12 @@ static int putreg(struct task_struct *child,
 		if (value >= TASK_SIZE_MAX)
 			return -EIO;
 		/*
-		 * When changing the segment base, use do_arch_prctl
+		 * When changing the segment base, use do_arch_prctl_64
 		 * to set either thread.fs or thread.fsindex and the
 		 * corresponding GDT slot.
 		 */
 		if (child->thread.fsbase != value)
-			return do_arch_prctl(child, ARCH_SET_FS, value);
+			return do_arch_prctl_64(child, ARCH_SET_FS, value);
 		return 0;
 	case offsetof(struct user_regs_struct,gs_base):
 		/*
@@ -410,7 +410,7 @@ static int putreg(struct task_struct *child,
 		if (value >= TASK_SIZE_MAX)
 			return -EIO;
 		if (child->thread.gsbase != value)
-			return do_arch_prctl(child, ARCH_SET_GS, value);
+			return do_arch_prctl_64(child, ARCH_SET_GS, value);
 		return 0;
 #endif
 	}
@@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request,
 		   Works just like arch_prctl, except that the arguments
 		   are reversed. */
 	case PTRACE_ARCH_PRCTL:
-		ret = do_arch_prctl(child, data, addr);
+		ret = do_arch_prctl_64(child, data, addr);
 		break;
 #endif
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ab8e32f..760433b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,18 +86,6 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 auditing of KVM MMU events at runtime.
 
-config KVM_DEVICE_ASSIGNMENT
-	bool "KVM legacy PCI device assignment support (DEPRECATED)"
-	depends on KVM && PCI && IOMMU_API
-	default n
-	---help---
-	  Provide support for legacy PCI device assignment through KVM.  The
-	  kernel now also supports a full featured userspace device driver
-	  framework through VFIO, which supersedes this support and provides
-	  better security.
-
-	  If unsure, say N.
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3bff207..09d4b17 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,8 +15,6 @@ kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
 			   hyperv.o page_track.o debugfs.o
 
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= assigned-dev.o iommu.o
-
 kvm-intel-y		+= vmx.o pmu_intel.o
 kvm-amd-y		+= svm.o pmu_amd.o
 
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
deleted file mode 100644
index 308b859..0000000
--- a/arch/x86/kvm/assigned-dev.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * Kernel-based Virtual Machine - device assignment support
- *
- * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/namei.h>
-#include <linux/fs.h>
-#include "irq.h"
-#include "assigned-dev.h"
-#include "trace/events/kvm.h"
-
-struct kvm_assigned_dev_kernel {
-	struct kvm_irq_ack_notifier ack_notifier;
-	struct list_head list;
-	int assigned_dev_id;
-	int host_segnr;
-	int host_busnr;
-	int host_devfn;
-	unsigned int entries_nr;
-	int host_irq;
-	bool host_irq_disabled;
-	bool pci_2_3;
-	struct msix_entry *host_msix_entries;
-	int guest_irq;
-	struct msix_entry *guest_msix_entries;
-	unsigned long irq_requested_type;
-	int irq_source_id;
-	int flags;
-	struct pci_dev *dev;
-	struct kvm *kvm;
-	spinlock_t intx_lock;
-	spinlock_t intx_mask_lock;
-	char irq_name[32];
-	struct pci_saved_state *pci_saved_state;
-};
-
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-						      int assigned_dev_id)
-{
-	struct kvm_assigned_dev_kernel *match;
-
-	list_for_each_entry(match, head, list) {
-		if (match->assigned_dev_id == assigned_dev_id)
-			return match;
-	}
-	return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
-				    *assigned_dev, int irq)
-{
-	int i, index;
-	struct msix_entry *host_msix_entries;
-
-	host_msix_entries = assigned_dev->host_msix_entries;
-
-	index = -1;
-	for (i = 0; i < assigned_dev->entries_nr; i++)
-		if (irq == host_msix_entries[i].vector) {
-			index = i;
-			break;
-		}
-	if (index < 0)
-		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-
-	return index;
-}
-
-static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int ret;
-
-	spin_lock(&assigned_dev->intx_lock);
-	if (pci_check_and_mask_intx(assigned_dev->dev)) {
-		assigned_dev->host_irq_disabled = true;
-		ret = IRQ_WAKE_THREAD;
-	} else
-		ret = IRQ_NONE;
-	spin_unlock(&assigned_dev->intx_lock);
-
-	return ret;
-}
-
-static void
-kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
-				 int vector)
-{
-	if (unlikely(assigned_dev->irq_requested_type &
-		     KVM_DEV_IRQ_GUEST_INTX)) {
-		spin_lock(&assigned_dev->intx_mask_lock);
-		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
-			kvm_set_irq(assigned_dev->kvm,
-				    assigned_dev->irq_source_id, vector, 1,
-				    false);
-		spin_unlock(&assigned_dev->intx_mask_lock);
-	} else
-		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-			    vector, 1, false);
-}
-
-static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
-	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-		spin_lock_irq(&assigned_dev->intx_lock);
-		disable_irq_nosync(irq);
-		assigned_dev->host_irq_disabled = true;
-		spin_unlock_irq(&assigned_dev->intx_lock);
-	}
-
-	kvm_assigned_dev_raise_guest_irq(assigned_dev,
-					 assigned_dev->guest_irq);
-
-	return IRQ_HANDLED;
-}
-
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- *  Other values - No need to retry.
- */
-static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
-				int level)
-{
-	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-	struct kvm_kernel_irq_routing_entry *e;
-	int ret = -EINVAL;
-	int idx;
-
-	trace_kvm_set_irq(irq, level, irq_source_id);
-
-	/*
-	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
-	 * which would need to be retried from thread context;  when same GSI
-	 * is connected to both PIC and IOAPIC, we'd have to report a
-	 * partial failure here.
-	 * Since there's no easy way to do this, we only support injecting MSI
-	 * which is limited to 1:1 GSI mapping.
-	 */
-	idx = srcu_read_lock(&kvm->irq_srcu);
-	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
-		e = &entries[0];
-		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
-						irq, level);
-	}
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
-}
-
-
-static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
-				       assigned_dev->irq_source_id,
-				       assigned_dev->guest_irq, 1);
-	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
-	kvm_assigned_dev_raise_guest_irq(assigned_dev,
-					 assigned_dev->guest_irq);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int index = find_index_from_host_irq(assigned_dev, irq);
-	u32 vector;
-	int ret = 0;
-
-	if (index >= 0) {
-		vector = assigned_dev->guest_msix_entries[index].vector;
-		ret = kvm_set_irq_inatomic(assigned_dev->kvm,
-					   assigned_dev->irq_source_id,
-					   vector, 1);
-	}
-
-	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-	int index = find_index_from_host_irq(assigned_dev, irq);
-	u32 vector;
-
-	if (index >= 0) {
-		vector = assigned_dev->guest_msix_entries[index].vector;
-		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
-	}
-
-	return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-	struct kvm_assigned_dev_kernel *dev =
-		container_of(kian, struct kvm_assigned_dev_kernel,
-			     ack_notifier);
-
-	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
-
-	spin_lock(&dev->intx_mask_lock);
-
-	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
-		bool reassert = false;
-
-		spin_lock_irq(&dev->intx_lock);
-		/*
-		 * The guest IRQ may be shared so this ack can come from an
-		 * IRQ for another guest device.
-		 */
-		if (dev->host_irq_disabled) {
-			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
-				enable_irq(dev->host_irq);
-			else if (!pci_check_and_unmask_intx(dev->dev))
-				reassert = true;
-			dev->host_irq_disabled = reassert;
-		}
-		spin_unlock_irq(&dev->intx_lock);
-
-		if (reassert)
-			kvm_set_irq(dev->kvm, dev->irq_source_id,
-				    dev->guest_irq, 1, false);
-	}
-
-	spin_unlock(&dev->intx_mask_lock);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
-			       struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	if (assigned_dev->ack_notifier.gsi != -1)
-		kvm_unregister_irq_ack_notifier(kvm,
-						&assigned_dev->ack_notifier);
-
-	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 0, false);
-
-	if (assigned_dev->irq_source_id != -1)
-		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
-	assigned_dev->irq_source_id = -1;
-	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
-			      struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	/*
-	 * We disable irq here to prevent further events.
-	 *
-	 * Notice this maybe result in nested disable if the interrupt type is
-	 * INTx, but it's OK for we are going to free it.
-	 *
-	 * If this function is a part of VM destroy, please ensure that till
-	 * now, the kvm state is still legal for probably we also have to wait
-	 * on a currently running IRQ handler.
-	 */
-	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-		int i;
-		for (i = 0; i < assigned_dev->entries_nr; i++)
-			disable_irq(assigned_dev->host_msix_entries[i].vector);
-
-		for (i = 0; i < assigned_dev->entries_nr; i++)
-			free_irq(assigned_dev->host_msix_entries[i].vector,
-				 assigned_dev);
-
-		assigned_dev->entries_nr = 0;
-		kfree(assigned_dev->host_msix_entries);
-		kfree(assigned_dev->guest_msix_entries);
-		pci_disable_msix(assigned_dev->dev);
-	} else {
-		/* Deal with MSI and INTx */
-		if ((assigned_dev->irq_requested_type &
-		     KVM_DEV_IRQ_HOST_INTX) &&
-		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-			spin_lock_irq(&assigned_dev->intx_lock);
-			pci_intx(assigned_dev->dev, false);
-			spin_unlock_irq(&assigned_dev->intx_lock);
-			synchronize_irq(assigned_dev->host_irq);
-		} else
-			disable_irq(assigned_dev->host_irq);
-
-		free_irq(assigned_dev->host_irq, assigned_dev);
-
-		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
-			pci_disable_msi(assigned_dev->dev);
-	}
-
-	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
-			    struct kvm_assigned_dev_kernel *assigned_dev,
-			    unsigned long irq_requested_type)
-{
-	unsigned long guest_irq_type, host_irq_type;
-
-	if (!irqchip_in_kernel(kvm))
-		return -EINVAL;
-	/* no irq assignment to deassign */
-	if (!assigned_dev->irq_requested_type)
-		return -ENXIO;
-
-	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
-	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
-	if (host_irq_type)
-		deassign_host_irq(kvm, assigned_dev);
-	if (guest_irq_type)
-		deassign_guest_irq(kvm, assigned_dev);
-
-	return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
-				  struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-				     struct kvm_assigned_dev_kernel
-				     *assigned_dev)
-{
-	kvm_free_assigned_irq(kvm, assigned_dev);
-
-	pci_reset_function(assigned_dev->dev);
-	if (pci_load_and_free_saved_state(assigned_dev->dev,
-					  &assigned_dev->pci_saved_state))
-		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
-		       __func__, dev_name(&assigned_dev->dev->dev));
-	else
-		pci_restore_state(assigned_dev->dev);
-
-	pci_clear_dev_assigned(assigned_dev->dev);
-
-	pci_release_regions(assigned_dev->dev);
-	pci_disable_device(assigned_dev->dev);
-	pci_dev_put(assigned_dev->dev);
-
-	list_del(&assigned_dev->list);
-	kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
-
-	list_for_each_entry_safe(assigned_dev, tmp,
-				 &kvm->arch.assigned_dev_head, list) {
-		kvm_free_assigned_device(kvm, assigned_dev);
-	}
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
-					    struct kvm_assigned_dev_kernel *dev)
-{
-	irq_handler_t irq_handler;
-	unsigned long flags;
-
-	dev->host_irq = dev->dev->irq;
-
-	/*
-	 * We can only share the IRQ line with other host devices if we are
-	 * able to disable the IRQ source at device-level - independently of
-	 * the guest driver. Otherwise host devices may suffer from unbounded
-	 * IRQ latencies when the guest keeps the line asserted.
-	 */
-	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
-		irq_handler = kvm_assigned_dev_intx;
-		flags = IRQF_SHARED;
-	} else {
-		irq_handler = NULL;
-		flags = IRQF_ONESHOT;
-	}
-	if (request_threaded_irq(dev->host_irq, irq_handler,
-				 kvm_assigned_dev_thread_intx, flags,
-				 dev->irq_name, dev))
-		return -EIO;
-
-	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
-		spin_lock_irq(&dev->intx_lock);
-		pci_intx(dev->dev, true);
-		spin_unlock_irq(&dev->intx_lock);
-	}
-	return 0;
-}
-
-static int assigned_device_enable_host_msi(struct kvm *kvm,
-					   struct kvm_assigned_dev_kernel *dev)
-{
-	int r;
-
-	if (!dev->dev->msi_enabled) {
-		r = pci_enable_msi(dev->dev);
-		if (r)
-			return r;
-	}
-
-	dev->host_irq = dev->dev->irq;
-	if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
-				 kvm_assigned_dev_thread_msi, 0,
-				 dev->irq_name, dev)) {
-		pci_disable_msi(dev->dev);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int assigned_device_enable_host_msix(struct kvm *kvm,
-					    struct kvm_assigned_dev_kernel *dev)
-{
-	int i, r = -EINVAL;
-
-	/* host_msix_entries and guest_msix_entries should have been
-	 * initialized */
-	if (dev->entries_nr == 0)
-		return r;
-
-	r = pci_enable_msix_exact(dev->dev,
-				  dev->host_msix_entries, dev->entries_nr);
-	if (r)
-		return r;
-
-	for (i = 0; i < dev->entries_nr; i++) {
-		r = request_threaded_irq(dev->host_msix_entries[i].vector,
-					 kvm_assigned_dev_msix,
-					 kvm_assigned_dev_thread_msix,
-					 0, dev->irq_name, dev);
-		if (r)
-			goto err;
-	}
-
-	return 0;
-err:
-	for (i -= 1; i >= 0; i--)
-		free_irq(dev->host_msix_entries[i].vector, dev);
-	pci_disable_msix(dev->dev);
-	return r;
-}
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
-				struct kvm_assigned_dev_kernel *dev,
-				struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = irq->guest_irq;
-	return 0;
-}
-
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *dev,
-			struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = -1;
-	return 0;
-}
-
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *dev,
-			struct kvm_assigned_irq *irq)
-{
-	dev->guest_irq = irq->guest_irq;
-	dev->ack_notifier.gsi = -1;
-	return 0;
-}
-
-static int assign_host_irq(struct kvm *kvm,
-			   struct kvm_assigned_dev_kernel *dev,
-			   __u32 host_irq_type)
-{
-	int r = -EEXIST;
-
-	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
-		return r;
-
-	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
-		 pci_name(dev->dev));
-
-	switch (host_irq_type) {
-	case KVM_DEV_IRQ_HOST_INTX:
-		r = assigned_device_enable_host_intx(kvm, dev);
-		break;
-	case KVM_DEV_IRQ_HOST_MSI:
-		r = assigned_device_enable_host_msi(kvm, dev);
-		break;
-	case KVM_DEV_IRQ_HOST_MSIX:
-		r = assigned_device_enable_host_msix(kvm, dev);
-		break;
-	default:
-		r = -EINVAL;
-	}
-	dev->host_irq_disabled = false;
-
-	if (!r)
-		dev->irq_requested_type |= host_irq_type;
-
-	return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
-			    struct kvm_assigned_dev_kernel *dev,
-			    struct kvm_assigned_irq *irq,
-			    unsigned long guest_irq_type)
-{
-	int id;
-	int r = -EEXIST;
-
-	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
-		return r;
-
-	id = kvm_request_irq_source_id(kvm);
-	if (id < 0)
-		return id;
-
-	dev->irq_source_id = id;
-
-	switch (guest_irq_type) {
-	case KVM_DEV_IRQ_GUEST_INTX:
-		r = assigned_device_enable_guest_intx(kvm, dev, irq);
-		break;
-	case KVM_DEV_IRQ_GUEST_MSI:
-		r = assigned_device_enable_guest_msi(kvm, dev, irq);
-		break;
-	case KVM_DEV_IRQ_GUEST_MSIX:
-		r = assigned_device_enable_guest_msix(kvm, dev, irq);
-		break;
-	default:
-		r = -EINVAL;
-	}
-
-	if (!r) {
-		dev->irq_requested_type |= guest_irq_type;
-		if (dev->ack_notifier.gsi != -1)
-			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
-	} else {
-		kvm_free_irq_source_id(kvm, dev->irq_source_id);
-		dev->irq_source_id = -1;
-	}
-
-	return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq *assigned_irq)
-{
-	int r = -EINVAL;
-	struct kvm_assigned_dev_kernel *match;
-	unsigned long host_irq_type, guest_irq_type;
-
-	if (!irqchip_in_kernel(kvm))
-		return r;
-
-	mutex_lock(&kvm->lock);
-	r = -ENODEV;
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match)
-		goto out;
-
-	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
-	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
-	r = -EINVAL;
-	/* can only assign one type at a time */
-	if (hweight_long(host_irq_type) > 1)
-		goto out;
-	if (hweight_long(guest_irq_type) > 1)
-		goto out;
-	if (host_irq_type == 0 && guest_irq_type == 0)
-		goto out;
-
-	r = 0;
-	if (host_irq_type)
-		r = assign_host_irq(kvm, match, host_irq_type);
-	if (r)
-		goto out;
-
-	if (guest_irq_type)
-		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
-					 struct kvm_assigned_irq
-					 *assigned_irq)
-{
-	int r = -ENODEV;
-	struct kvm_assigned_dev_kernel *match;
-	unsigned long irq_type;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match)
-		goto out;
-
-	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
-					  KVM_DEV_IRQ_GUEST_MASK);
-	r = kvm_deassign_irq(kvm, match, irq_type);
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-/*
- * We want to test whether the caller has been granted permissions to
- * use this device.  To be able to configure and control the device,
- * the user needs access to PCI configuration space and BAR resources.
- * These are accessed through PCI sysfs.  PCI config space is often
- * passed to the process calling this ioctl via file descriptor, so we
- * can't rely on access to that file.  We can check for permissions
- * on each of the BAR resource files, which is a pretty clear
- * indicator that the user has been granted access to the device.
- */
-static int probe_sysfs_permissions(struct pci_dev *dev)
-{
-#ifdef CONFIG_SYSFS
-	int i;
-	bool bar_found = false;
-
-	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
-		char *kpath, *syspath;
-		struct path path;
-		struct inode *inode;
-		int r;
-
-		if (!pci_resource_len(dev, i))
-			continue;
-
-		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
-		if (!kpath)
-			return -ENOMEM;
-
-		/* Per sysfs-rules, sysfs is always at /sys */
-		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
-		kfree(kpath);
-		if (!syspath)
-			return -ENOMEM;
-
-		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
-		kfree(syspath);
-		if (r)
-			return r;
-
-		inode = d_backing_inode(path.dentry);
-
-		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
-		path_put(&path);
-		if (r)
-			return r;
-
-		bar_found = true;
-	}
-
-	/* If no resources, probably something special */
-	if (!bar_found)
-		return -EPERM;
-
-	return 0;
-#else
-	return -EINVAL; /* No way to control the device without sysfs */
-#endif
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-				      struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0, idx;
-	struct kvm_assigned_dev_kernel *match;
-	struct pci_dev *dev;
-
-	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
-		return -EINVAL;
-
-	mutex_lock(&kvm->lock);
-	idx = srcu_read_lock(&kvm->srcu);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (match) {
-		/* device already assigned */
-		r = -EEXIST;
-		goto out;
-	}
-
-	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-	if (match == NULL) {
-		printk(KERN_INFO "%s: Couldn't allocate memory\n",
-		       __func__);
-		r = -ENOMEM;
-		goto out;
-	}
-	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
-				   assigned_dev->busnr,
-				   assigned_dev->devfn);
-	if (!dev) {
-		printk(KERN_INFO "%s: host device not found\n", __func__);
-		r = -EINVAL;
-		goto out_free;
-	}
-
-	/* Don't allow bridges to be assigned */
-	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
-		r = -EPERM;
-		goto out_put;
-	}
-
-	r = probe_sysfs_permissions(dev);
-	if (r)
-		goto out_put;
-
-	if (pci_enable_device(dev)) {
-		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-		r = -EBUSY;
-		goto out_put;
-	}
-	r = pci_request_regions(dev, "kvm_assigned_device");
-	if (r) {
-		printk(KERN_INFO "%s: Could not get access to device regions\n",
-		       __func__);
-		goto out_disable;
-	}
-
-	pci_reset_function(dev);
-	pci_save_state(dev);
-	match->pci_saved_state = pci_store_saved_state(dev);
-	if (!match->pci_saved_state)
-		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
-		       __func__, dev_name(&dev->dev));
-
-	if (!pci_intx_mask_supported(dev))
-		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
-
-	match->assigned_dev_id = assigned_dev->assigned_dev_id;
-	match->host_segnr = assigned_dev->segnr;
-	match->host_busnr = assigned_dev->busnr;
-	match->host_devfn = assigned_dev->devfn;
-	match->flags = assigned_dev->flags;
-	match->dev = dev;
-	spin_lock_init(&match->intx_lock);
-	spin_lock_init(&match->intx_mask_lock);
-	match->irq_source_id = -1;
-	match->kvm = kvm;
-	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-
-	list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-	if (!kvm->arch.iommu_domain) {
-		r = kvm_iommu_map_guest(kvm);
-		if (r)
-			goto out_list_del;
-	}
-	r = kvm_assign_device(kvm, match->dev);
-	if (r)
-		goto out_list_del;
-
-out:
-	srcu_read_unlock(&kvm->srcu, idx);
-	mutex_unlock(&kvm->lock);
-	return r;
-out_list_del:
-	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
-		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
-		       __func__, dev_name(&dev->dev));
-	list_del(&match->list);
-	pci_release_regions(dev);
-out_disable:
-	pci_disable_device(dev);
-out_put:
-	pci_dev_put(dev);
-out_free:
-	kfree(match);
-	srcu_read_unlock(&kvm->srcu, idx);
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
-		struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (!match) {
-		printk(KERN_INFO "%s: device hasn't been assigned before, "
-		  "so cannot be deassigned\n", __func__);
-		r = -EINVAL;
-		goto out;
-	}
-
-	kvm_deassign_device(kvm, match->dev);
-
-	kvm_free_assigned_device(kvm, match);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
-				    struct kvm_assigned_msix_nr *entry_nr)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *adev;
-
-	mutex_lock(&kvm->lock);
-
-	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      entry_nr->assigned_dev_id);
-	if (!adev) {
-		r = -EINVAL;
-		goto msix_nr_out;
-	}
-
-	if (adev->entries_nr == 0) {
-		adev->entries_nr = entry_nr->entry_nr;
-		if (adev->entries_nr == 0 ||
-		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
-			r = -EINVAL;
-			goto msix_nr_out;
-		}
-
-		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
-						entry_nr->entry_nr,
-						GFP_KERNEL);
-		if (!adev->host_msix_entries) {
-			r = -ENOMEM;
-			goto msix_nr_out;
-		}
-		adev->guest_msix_entries =
-			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
-				GFP_KERNEL);
-		if (!adev->guest_msix_entries) {
-			kfree(adev->host_msix_entries);
-			r = -ENOMEM;
-			goto msix_nr_out;
-		}
-	} else /* Not allowed set MSI-X number twice */
-		r = -EINVAL;
-msix_nr_out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
-				       struct kvm_assigned_msix_entry *entry)
-{
-	int r = 0, i;
-	struct kvm_assigned_dev_kernel *adev;
-
-	mutex_lock(&kvm->lock);
-
-	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      entry->assigned_dev_id);
-
-	if (!adev) {
-		r = -EINVAL;
-		goto msix_entry_out;
-	}
-
-	for (i = 0; i < adev->entries_nr; i++)
-		if (adev->guest_msix_entries[i].vector == 0 ||
-		    adev->guest_msix_entries[i].entry == entry->entry) {
-			adev->guest_msix_entries[i].entry = entry->entry;
-			adev->guest_msix_entries[i].vector = entry->gsi;
-			adev->host_msix_entries[i].entry = entry->entry;
-			break;
-		}
-	if (i == adev->entries_nr) {
-		r = -ENOSPC;
-		goto msix_entry_out;
-	}
-
-msix_entry_out:
-	mutex_unlock(&kvm->lock);
-
-	return r;
-}
-
-static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
-		struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (!match) {
-		r = -ENODEV;
-		goto out;
-	}
-
-	spin_lock(&match->intx_mask_lock);
-
-	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
-	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
-
-	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
-			kvm_set_irq(match->kvm, match->irq_source_id,
-				    match->guest_irq, 0, false);
-			/*
-			 * Masking at hardware-level is performed on demand,
-			 * i.e. when an IRQ actually arrives at the host.
-			 */
-		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-			/*
-			 * Unmask the IRQ line if required. Unmasking at
-			 * device level will be performed by user space.
-			 */
-			spin_lock_irq(&match->intx_lock);
-			if (match->host_irq_disabled) {
-				enable_irq(match->host_irq);
-				match->host_irq_disabled = false;
-			}
-			spin_unlock_irq(&match->intx_lock);
-		}
-	}
-
-	spin_unlock(&match->intx_mask_lock);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-				  unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	int r;
-
-	switch (ioctl) {
-	case KVM_ASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_IRQ: {
-		r = -EOPNOTSUPP;
-		break;
-	}
-	case KVM_ASSIGN_DEV_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_DEASSIGN_DEV_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_DEASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_MSIX_NR: {
-		struct kvm_assigned_msix_nr entry_nr;
-		r = -EFAULT;
-		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
-			goto out;
-		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_MSIX_ENTRY: {
-		struct kvm_assigned_msix_entry entry;
-		r = -EFAULT;
-		if (copy_from_user(&entry, argp, sizeof entry))
-			goto out;
-		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_SET_INTX_MASK: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
-		break;
-	}
-	default:
-		r = -ENOTTY;
-		break;
-	}
-out:
-	return r;
-}
diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h
deleted file mode 100644
index a428c1a..0000000
--- a/arch/x86/kvm/assigned-dev.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H
-#define ARCH_X86_KVM_ASSIGNED_DEV_H
-
-#include <linux/kvm_host.h>
-
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev);
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev);
-
-int kvm_iommu_map_guest(struct kvm *kvm);
-int kvm_iommu_unmap_guest(struct kvm *kvm);
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-				  unsigned long arg);
-
-void kvm_free_all_assigned_devices(struct kvm *kvm);
-#else
-static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-	return 0;
-}
-
-static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-						unsigned long arg)
-{
-	return -ENOTTY;
-}
-
-static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
-#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
-
-#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index efde6cc..a181ae7 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -876,6 +876,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
 	u32 eax, ebx, ecx, edx;
 
+	if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+		return 1;
+
 	eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 35058c2..a6fd40a 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -205,4 +205,15 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
 	return x86_stepping(best->eax);
 }
 
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.msr_misc_features_enables &
+		  MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 45c7306..c25cfaf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
 	u64 smbase;
 	int ret;
 
-	if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+	if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
 		return emulate_ud(ctxt);
 
 	/*
@@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
 		return X86EMUL_UNHANDLEABLE;
 	}
 
-	if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+	if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
 		ctxt->ops->set_nmi_mask(ctxt, false);
 
-	ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
-	ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+	ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
+		~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
 	return X86EMUL_CONTINUE;
 }
 
@@ -3854,6 +3854,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
 static int em_cpuid(struct x86_emulate_ctxt *ctxt)
 {
 	u32 eax, ebx, ecx, edx;
+	u64 msr = 0;
+
+	ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+	if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+	    ctxt->ops->cpl(ctxt)) {
+		return emulate_gp(ctxt, 0);
+	}
 
 	eax = reg_read(ctxt, VCPU_REGS_RAX);
 	ecx = reg_read(ctxt, VCPU_REGS_RCX);
@@ -5316,6 +5323,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 	const struct x86_emulate_ops *ops = ctxt->ops;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = ctxt->dst.type;
+	unsigned emul_flags;
 
 	ctxt->mem_read.pos = 0;
 
@@ -5330,6 +5338,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		goto done;
 	}
 
+	emul_flags = ctxt->ops->get_hflags(ctxt);
 	if (unlikely(ctxt->d &
 		     (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
 		if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
@@ -5363,7 +5372,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 				fetch_possible_mmx_operand(ctxt, &ctxt->dst);
 		}
 
-		if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+		if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
 			rc = emulator_check_intercept(ctxt, ctxt->intercept,
 						      X86_ICPT_PRE_EXCEPT);
 			if (rc != X86EMUL_CONTINUE)
@@ -5392,7 +5401,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 				goto done;
 		}
 
-		if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+		if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
 			rc = emulator_check_intercept(ctxt, ctxt->intercept,
 						      X86_ICPT_POST_EXCEPT);
 			if (rc != X86EMUL_CONTINUE)
@@ -5446,7 +5455,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 special_insn:
 
-	if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+	if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
 		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_POST_MEMACCESS);
 		if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 73ea24d..bdcd413 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s)
 	__releases(&s->lock)
 {
 	bool wakeup = s->wakeup_needed;
-	struct kvm_vcpu *vcpu, *found = NULL;
+	struct kvm_vcpu *vcpu;
 	int i;
 
 	s->wakeup_needed = false;
@@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s)
 	if (wakeup) {
 		kvm_for_each_vcpu(i, vcpu, s->kvm) {
 			if (kvm_apic_accept_pic_intr(vcpu)) {
-				found = vcpu;
-				break;
+				kvm_make_request(KVM_REQ_EVENT, vcpu);
+				kvm_vcpu_kick(vcpu);
+				return;
 			}
 		}
-
-		if (!found)
-			return;
-
-		kvm_make_request(KVM_REQ_EVENT, found);
-		kvm_vcpu_kick(found);
 	}
 }
 
@@ -239,7 +234,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 int kvm_pic_read_irq(struct kvm *kvm)
 {
 	int irq, irq2, intno;
-	struct kvm_pic *s = pic_irqchip(kvm);
+	struct kvm_pic *s = kvm->arch.vpic;
 
 	s->output = 0;
 
@@ -273,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	return intno;
 }
 
-void kvm_pic_reset(struct kvm_kpic_state *s)
+static void kvm_pic_reset(struct kvm_kpic_state *s)
 {
 	int irq, i;
 	struct kvm_vcpu *vcpu;
@@ -422,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
 	return ret;
 }
 
-static u32 pic_ioport_read(void *opaque, u32 addr1)
+static u32 pic_ioport_read(void *opaque, u32 addr)
 {
 	struct kvm_kpic_state *s = opaque;
-	unsigned int addr;
 	int ret;
 
-	addr = addr1;
-	addr &= 1;
 	if (s->poll) {
-		ret = pic_poll_read(s, addr1);
+		ret = pic_poll_read(s, addr);
 		s->poll = 0;
 	} else
-		if (addr == 0)
+		if ((addr & 1) == 0)
 			if (s->read_reg_select)
 				ret = s->isr;
 			else
@@ -456,76 +448,64 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
 	return s->elcr;
 }
 
-static int picdev_in_range(gpa_t addr)
-{
-	switch (addr) {
-	case 0x20:
-	case 0x21:
-	case 0xa0:
-	case 0xa1:
-	case 0x4d0:
-	case 0x4d1:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
 static int picdev_write(struct kvm_pic *s,
 			 gpa_t addr, int len, const void *val)
 {
 	unsigned char data = *(unsigned char *)val;
-	if (!picdev_in_range(addr))
-		return -EOPNOTSUPP;
 
 	if (len != 1) {
 		pr_pic_unimpl("non byte write\n");
 		return 0;
 	}
-	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
 	case 0xa0:
 	case 0xa1:
+		pic_lock(s);
 		pic_ioport_write(&s->pics[addr >> 7], addr, data);
+		pic_unlock(s);
 		break;
 	case 0x4d0:
 	case 0x4d1:
+		pic_lock(s);
 		elcr_ioport_write(&s->pics[addr & 1], addr, data);
+		pic_unlock(s);
 		break;
+	default:
+		return -EOPNOTSUPP;
 	}
-	pic_unlock(s);
 	return 0;
 }
 
 static int picdev_read(struct kvm_pic *s,
 		       gpa_t addr, int len, void *val)
 {
-	unsigned char data = 0;
-	if (!picdev_in_range(addr))
-		return -EOPNOTSUPP;
+	unsigned char *data = (unsigned char *)val;
 
 	if (len != 1) {
 		memset(val, 0, len);
 		pr_pic_unimpl("non byte read\n");
 		return 0;
 	}
-	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
 	case 0xa0:
 	case 0xa1:
-		data = pic_ioport_read(&s->pics[addr >> 7], addr);
+		pic_lock(s);
+		*data = pic_ioport_read(&s->pics[addr >> 7], addr);
+		pic_unlock(s);
 		break;
 	case 0x4d0:
 	case 0x4d1:
-		data = elcr_ioport_read(&s->pics[addr & 1], addr);
+		pic_lock(s);
+		*data = elcr_ioport_read(&s->pics[addr & 1], addr);
+		pic_unlock(s);
 		break;
+	default:
+		return -EOPNOTSUPP;
 	}
-	*(unsigned char *)val = data;
-	pic_unlock(s);
 	return 0;
 }
 
@@ -576,7 +556,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
  */
 static void pic_irq_request(struct kvm *kvm, int level)
 {
-	struct kvm_pic *s = pic_irqchip(kvm);
+	struct kvm_pic *s = kvm->arch.vpic;
 
 	if (!s->output)
 		s->wakeup_needed = true;
@@ -657,9 +637,14 @@ void kvm_pic_destroy(struct kvm *kvm)
 {
 	struct kvm_pic *vpic = kvm->arch.vpic;
 
+	if (!vpic)
+		return;
+
+	mutex_lock(&kvm->slots_lock);
 	kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
 	kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
 	kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+	mutex_unlock(&kvm->slots_lock);
 
 	kvm->arch.vpic = NULL;
 	kfree(vpic);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 6e219e5..bdff437 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -266,11 +266,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
 	spin_unlock(&ioapic->lock);
 }
 
-void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
 {
-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-
-	if (!ioapic)
+	if (!ioapic_in_kernel(kvm))
 		return;
 	kvm_make_scan_ioapic_request(kvm);
 }
@@ -315,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
 		    && ioapic->irr & (1 << index))
 			ioapic_service(ioapic, index, false);
-		kvm_vcpu_request_scan_ioapic(ioapic->kvm);
+		kvm_make_scan_ioapic_request(ioapic->kvm);
 		break;
 	}
 }
@@ -624,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (ret < 0) {
 		kvm->arch.vioapic = NULL;
 		kfree(ioapic);
-		return ret;
 	}
 
-	kvm_vcpu_request_scan_ioapic(kvm);
 	return ret;
 }
 
@@ -635,37 +631,36 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 {
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
+	if (!ioapic)
+		return;
+
 	cancel_delayed_work_sync(&ioapic->eoi_inject);
+	mutex_lock(&kvm->slots_lock);
 	kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+	mutex_unlock(&kvm->slots_lock);
 	kvm->arch.vioapic = NULL;
 	kfree(ioapic);
 }
 
-int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 {
-	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
-	if (!ioapic)
-		return -EINVAL;
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
 	spin_lock(&ioapic->lock);
 	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
 	state->irr &= ~ioapic->irr_delivered;
 	spin_unlock(&ioapic->lock);
-	return 0;
 }
 
-int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 {
-	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
-	if (!ioapic)
-		return -EINVAL;
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
 	spin_lock(&ioapic->lock);
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	ioapic->irr = 0;
 	ioapic->irr_delivered = 0;
-	kvm_vcpu_request_scan_ioapic(kvm);
+	kvm_make_scan_ioapic_request(kvm);
 	kvm_ioapic_inject_all(ioapic, state->irr);
 	spin_unlock(&ioapic->lock);
-	return 0;
 }
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 1cc6e54..29ce197 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -105,17 +105,13 @@ do {									\
 #define ASSERT(x) do { } while (0)
 #endif
 
-static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
-{
-	return kvm->arch.vioapic;
-}
-
 static inline int ioapic_in_kernel(struct kvm *kvm)
 {
-	int ret;
+	int mode = kvm->arch.irqchip_mode;
 
-	ret = (ioapic_irqchip(kvm) != NULL);
-	return ret;
+	/* Matches smp_wmb() when setting irqchip_mode */
+	smp_rmb();
+	return mode == KVM_IRQCHIP_KERNEL;
 }
 
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -132,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 			     struct kvm_lapic_irq *irq,
 			     struct dest_map *dest_map);
-int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
 			   ulong *ioapic_handled_vectors);
 void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
deleted file mode 100644
index b181426..0000000
--- a/arch/x86/kvm/iommu.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Author: Allen M. Kay <allen.m.kay@intel.com>
- * Author: Weidong Han <weidong.han@intel.com>
- * Author: Ben-Ami Yassour <benami@il.ibm.com>
- */
-
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/stat.h>
-#include <linux/iommu.h>
-#include "assigned-dev.h"
-
-static bool allow_unsafe_assigned_interrupts;
-module_param_named(allow_unsafe_assigned_interrupts,
-		   allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
- "Enable device assignment on platforms without interrupt remapping support.");
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm);
-static void kvm_iommu_put_pages(struct kvm *kvm,
-				gfn_t base_gfn, unsigned long npages);
-
-static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
-			   unsigned long npages)
-{
-	gfn_t end_gfn;
-	kvm_pfn_t pfn;
-
-	pfn     = gfn_to_pfn_memslot(slot, gfn);
-	end_gfn = gfn + npages;
-	gfn    += 1;
-
-	if (is_error_noslot_pfn(pfn))
-		return pfn;
-
-	while (gfn < end_gfn)
-		gfn_to_pfn_memslot(slot, gfn++);
-
-	return pfn;
-}
-
-static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
-		unsigned long npages)
-{
-	unsigned long i;
-
-	for (i = 0; i < npages; ++i)
-		kvm_release_pfn_clean(pfn + i);
-}
-
-int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-	gfn_t gfn, end_gfn;
-	kvm_pfn_t pfn;
-	int r = 0;
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int flags;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	gfn     = slot->base_gfn;
-	end_gfn = gfn + slot->npages;
-
-	flags = IOMMU_READ;
-	if (!(slot->flags & KVM_MEM_READONLY))
-		flags |= IOMMU_WRITE;
-	if (!kvm->arch.iommu_noncoherent)
-		flags |= IOMMU_CACHE;
-
-
-	while (gfn < end_gfn) {
-		unsigned long page_size;
-
-		/* Check if already mapped */
-		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
-			gfn += 1;
-			continue;
-		}
-
-		/* Get the page size we could use to map */
-		page_size = kvm_host_page_size(kvm, gfn);
-
-		/* Make sure the page_size does not exceed the memslot */
-		while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
-			page_size >>= 1;
-
-		/* Make sure gfn is aligned to the page size we want to map */
-		while ((gfn << PAGE_SHIFT) & (page_size - 1))
-			page_size >>= 1;
-
-		/* Make sure hva is aligned to the page size we want to map */
-		while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
-			page_size >>= 1;
-
-		/*
-		 * Pin all pages we are about to map in memory. This is
-		 * important because we unmap and unpin in 4kb steps later.
-		 */
-		pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT);
-		if (is_error_noslot_pfn(pfn)) {
-			gfn += 1;
-			continue;
-		}
-
-		/* Map into IO address space */
-		r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
-			      page_size, flags);
-		if (r) {
-			printk(KERN_ERR "kvm_iommu_map_address:"
-			       "iommu failed to map pfn=%llx\n", pfn);
-			kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT);
-			goto unmap_pages;
-		}
-
-		gfn += page_size >> PAGE_SHIFT;
-
-		cond_resched();
-	}
-
-	return 0;
-
-unmap_pages:
-	kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
-	return r;
-}
-
-static int kvm_iommu_map_memslots(struct kvm *kvm)
-{
-	int idx, r = 0;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-
-	if (kvm->arch.iommu_noncoherent)
-		kvm_arch_register_noncoherent_dma(kvm);
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-
-	kvm_for_each_memslot(memslot, slots) {
-		r = kvm_iommu_map_pages(kvm, memslot);
-		if (r)
-			break;
-	}
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return r;
-}
-
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int r;
-	bool noncoherent;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	if (pdev == NULL)
-		return -ENODEV;
-
-	r = iommu_attach_device(domain, &pdev->dev);
-	if (r) {
-		dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
-		return r;
-	}
-
-	noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
-
-	/* Check if need to update IOMMU page table for guest memory */
-	if (noncoherent != kvm->arch.iommu_noncoherent) {
-		kvm_iommu_unmap_memslots(kvm);
-		kvm->arch.iommu_noncoherent = noncoherent;
-		r = kvm_iommu_map_memslots(kvm);
-		if (r)
-			goto out_unmap;
-	}
-
-	kvm_arch_start_assignment(kvm);
-	pci_set_dev_assigned(pdev);
-
-	dev_info(&pdev->dev, "kvm assign device\n");
-
-	return 0;
-out_unmap:
-	kvm_iommu_unmap_memslots(kvm);
-	return r;
-}
-
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	if (pdev == NULL)
-		return -ENODEV;
-
-	iommu_detach_device(domain, &pdev->dev);
-
-	pci_clear_dev_assigned(pdev);
-	kvm_arch_end_assignment(kvm);
-
-	dev_info(&pdev->dev, "kvm deassign device\n");
-
-	return 0;
-}
-
-int kvm_iommu_map_guest(struct kvm *kvm)
-{
-	int r;
-
-	if (!iommu_present(&pci_bus_type)) {
-		printk(KERN_ERR "%s: iommu not found\n", __func__);
-		return -ENODEV;
-	}
-
-	mutex_lock(&kvm->slots_lock);
-
-	kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
-	if (!kvm->arch.iommu_domain) {
-		r = -ENOMEM;
-		goto out_unlock;
-	}
-
-	if (!allow_unsafe_assigned_interrupts &&
-	    !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) {
-		printk(KERN_WARNING "%s: No interrupt remapping support,"
-		       " disallowing device assignment."
-		       " Re-enable with \"allow_unsafe_assigned_interrupts=1\""
-		       " module option.\n", __func__);
-		iommu_domain_free(kvm->arch.iommu_domain);
-		kvm->arch.iommu_domain = NULL;
-		r = -EPERM;
-		goto out_unlock;
-	}
-
-	r = kvm_iommu_map_memslots(kvm);
-	if (r)
-		kvm_iommu_unmap_memslots(kvm);
-
-out_unlock:
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-static void kvm_iommu_put_pages(struct kvm *kvm,
-				gfn_t base_gfn, unsigned long npages)
-{
-	struct iommu_domain *domain;
-	gfn_t end_gfn, gfn;
-	kvm_pfn_t pfn;
-	u64 phys;
-
-	domain  = kvm->arch.iommu_domain;
-	end_gfn = base_gfn + npages;
-	gfn     = base_gfn;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return;
-
-	while (gfn < end_gfn) {
-		unsigned long unmap_pages;
-		size_t size;
-
-		/* Get physical address */
-		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
-
-		if (!phys) {
-			gfn++;
-			continue;
-		}
-
-		pfn  = phys >> PAGE_SHIFT;
-
-		/* Unmap address from IO address space */
-		size       = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
-		unmap_pages = 1ULL << get_order(size);
-
-		/* Unpin all pages we just unmapped to not leak any memory */
-		kvm_unpin_pages(kvm, pfn, unmap_pages);
-
-		gfn += unmap_pages;
-
-		cond_resched();
-	}
-}
-
-void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-	kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
-}
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-{
-	int idx;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm_memslots(kvm);
-
-	kvm_for_each_memslot(memslot, slots)
-		kvm_iommu_unmap_pages(kvm, memslot);
-
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	if (kvm->arch.iommu_noncoherent)
-		kvm_arch_unregister_noncoherent_dma(kvm);
-
-	return 0;
-}
-
-int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	mutex_lock(&kvm->slots_lock);
-	kvm_iommu_unmap_memslots(kvm);
-	kvm->arch.iommu_domain = NULL;
-	kvm->arch.iommu_noncoherent = false;
-	mutex_unlock(&kvm->slots_lock);
-
-	iommu_domain_free(domain);
-	return 0;
-}
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 60d91c9..5c24811 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
 		if (irqchip_split(v->kvm))
 			return pending_userspace_extint(v);
 		else
-			return pic_irqchip(v->kvm)->output;
+			return v->kvm->arch.vpic->output;
 	} else
 		return 0;
 }
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 40d5b2c..0edd22c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -78,40 +78,42 @@ void kvm_pic_destroy(struct kvm *kvm);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
-{
-	return kvm->arch.vpic;
-}
-
 static inline int pic_in_kernel(struct kvm *kvm)
 {
-	int ret;
+	int mode = kvm->arch.irqchip_mode;
 
-	ret = (pic_irqchip(kvm) != NULL);
-	return ret;
+	/* Matches smp_wmb() when setting irqchip_mode */
+	smp_rmb();
+	return mode == KVM_IRQCHIP_KERNEL;
 }
 
 static inline int irqchip_split(struct kvm *kvm)
 {
-	return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT;
+	int mode = kvm->arch.irqchip_mode;
+
+	/* Matches smp_wmb() when setting irqchip_mode */
+	smp_rmb();
+	return mode == KVM_IRQCHIP_SPLIT;
 }
 
 static inline int irqchip_kernel(struct kvm *kvm)
 {
-	return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL;
+	int mode = kvm->arch.irqchip_mode;
+
+	/* Matches smp_wmb() when setting irqchip_mode */
+	smp_rmb();
+	return mode == KVM_IRQCHIP_KERNEL;
 }
 
 static inline int irqchip_in_kernel(struct kvm *kvm)
 {
-	bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE;
+	int mode = kvm->arch.irqchip_mode;
 
-	/* Matches with wmb after initializing kvm->irq_routing. */
+	/* Matches smp_wmb() when setting irqchip_mode */
 	smp_rmb();
-	return ret;
+	return mode > KVM_IRQCHIP_INIT_IN_PROGRESS;
 }
 
-void kvm_pic_reset(struct kvm_kpic_state *s);
-
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6825cd3..4517a4c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
 			   bool line_status)
 {
-	struct kvm_pic *pic = pic_irqchip(kvm);
+	struct kvm_pic *pic = kvm->arch.vpic;
 	return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
 }
 
@@ -232,11 +232,11 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 		goto unlock;
 	}
 	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
-	if (!ioapic_in_kernel(kvm))
+	if (!irqchip_kernel(kvm))
 		goto unlock;
 
 	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
-	kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
+	kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
 unlock:
 	mutex_unlock(&kvm->irq_lock);
 }
@@ -278,38 +278,35 @@ int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
 {
-	int r = -EINVAL;
-	int delta;
-	unsigned max_pin;
+	/* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */
+	if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE)
+		return -EINVAL;
 
+	/* Matches smp_wmb() when setting irqchip_mode */
+	smp_rmb();
 	switch (ue->type) {
 	case KVM_IRQ_ROUTING_IRQCHIP:
-		delta = 0;
+		if (irqchip_split(kvm))
+			return -EINVAL;
+		e->irqchip.pin = ue->u.irqchip.pin;
 		switch (ue->u.irqchip.irqchip) {
 		case KVM_IRQCHIP_PIC_SLAVE:
-			delta = 8;
+			e->irqchip.pin += PIC_NUM_PINS / 2;
 			/* fall through */
 		case KVM_IRQCHIP_PIC_MASTER:
-			if (!pic_in_kernel(kvm))
-				goto out;
-
+			if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+				return -EINVAL;
 			e->set = kvm_set_pic_irq;
-			max_pin = PIC_NUM_PINS;
 			break;
 		case KVM_IRQCHIP_IOAPIC:
-			if (!ioapic_in_kernel(kvm))
-				goto out;
-
-			max_pin = KVM_IOAPIC_NUM_PINS;
+			if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+				return -EINVAL;
 			e->set = kvm_set_ioapic_irq;
 			break;
 		default:
-			goto out;
+			return -EINVAL;
 		}
 		e->irqchip.irqchip = ue->u.irqchip.irqchip;
-		e->irqchip.pin = ue->u.irqchip.pin + delta;
-		if (e->irqchip.pin >= max_pin)
-			goto out;
 		break;
 	case KVM_IRQ_ROUTING_MSI:
 		e->set = kvm_set_msi;
@@ -318,7 +315,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->msi.data = ue->u.msi.data;
 
 		if (kvm_msi_route_invalid(kvm, e))
-			goto out;
+			return -EINVAL;
 		break;
 	case KVM_IRQ_ROUTING_HV_SINT:
 		e->set = kvm_hv_set_sint;
@@ -326,12 +323,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->hv_sint.sint = ue->u.hv_sint.sint;
 		break;
 	default:
-		goto out;
+		return -EINVAL;
 	}
 
-	r = 0;
-out:
-	return r;
+	return 0;
 }
 
 bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac78105..5586765 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4340,7 +4340,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+			     bool accessed_dirty)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
@@ -4349,6 +4350,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 
 	context->nx = true;
+	context->ept_ad = accessed_dirty;
 	context->page_fault = ept_page_fault;
 	context->gva_to_gpa = ept_gva_to_gpa;
 	context->sync_page = ept_sync_page;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ddc56e9..d8ccb32 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -74,7 +74,8 @@ enum {
 
 int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+			     bool accessed_dirty);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 37942e4..60168cd 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
 }
 
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+	struct kvm_page_track_notifier_head *head;
+
+	head = &kvm->arch.track_notifier_head;
+	cleanup_srcu_struct(&head->track_srcu);
+}
+
 void kvm_page_track_init(struct kvm *kvm)
 {
 	struct kvm_page_track_notifier_head *head;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a011054..314d207 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,13 +23,6 @@
  * so the code in this file is compiled twice, once per pte size.
  */
 
-/*
- * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
- * uses for EPT without A/D paging type.
- */
-extern u64 __pure __using_nonexistent_pte_bit(void)
-	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
-
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
@@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
-	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
-	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
 	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
 	#ifdef CONFIG_X86_64
 	#define PT_MAX_FULL_LEVELS 4
 	#define CMPXCHG cmpxchg
@@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
-	#define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
-	#define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
 	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
 	#define CMPXCHG cmpxchg
 #elif PTTYPE == PTTYPE_EPT
 	#define pt_element_t u64
@@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
 	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
-	#define PT_GUEST_ACCESSED_MASK 0
-	#define PT_GUEST_DIRTY_MASK 0
-	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
-	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+	#define PT_GUEST_DIRTY_SHIFT 9
+	#define PT_GUEST_ACCESSED_SHIFT 8
+	#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
 	#define CMPXCHG cmpxchg64
 	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
 
+#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
 #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
 #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
 
@@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
-static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+					     unsigned gpte)
 {
 	unsigned mask;
 
 	/* dirty bit is not supported, so no need to track it */
-	if (!PT_GUEST_DIRTY_MASK)
+	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
 		return;
 
 	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
@@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 		goto no_present;
 
 	/* if accessed bit is not supported prefetch non accessed gpte */
-	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+	if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
 		goto no_present;
 
 	return false;
@@ -217,7 +211,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 	int ret;
 
 	/* dirty/accessed bits are not supported, so no need to update them */
-	if (!PT_GUEST_DIRTY_MASK)
+	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
 		return 0;
 
 	for (level = walker->max_level; level >= walker->level; --level) {
@@ -286,7 +280,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
 	unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
+	unsigned nested_access;
 	gpa_t pte_gpa;
+	bool have_ad;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault  = access & PFERR_USER_MASK;
@@ -299,6 +295,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 retry_walk:
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
+	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
 	if (walker->level == PT32E_ROOT_LEVEL) {
@@ -312,7 +309,15 @@ retry_walk:
 	walker->max_level = walker->level;
 	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
 
-	accessed_dirty = PT_GUEST_ACCESSED_MASK;
+	accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0;
+
+	/*
+	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+	 * by the MOV to CR instruction are treated as reads and do not cause the
+	 * processor to set the dirty flag in any EPT paging-structure entry.
+	 */
+	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
 	pt_access = pte_access = ACC_ALL;
 	++walker->level;
 
@@ -332,7 +337,7 @@ retry_walk:
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
 		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
-					      PFERR_USER_MASK|PFERR_WRITE_MASK,
+					      nested_access,
 					      &walker->fault);
 
 		/*
@@ -394,7 +399,7 @@ retry_walk:
 	walker->gfn = real_gpa >> PAGE_SHIFT;
 
 	if (!write_fault)
-		FNAME(protect_clean_gpte)(&pte_access, pte);
+		FNAME(protect_clean_gpte)(mmu, &pte_access, pte);
 	else
 		/*
 		 * On a write fault, fold the dirty bit into accessed_dirty.
@@ -485,7 +490,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
-	FNAME(protect_clean_gpte)(&pte_access, gpte);
+	FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
 	if (is_error_pfn(pfn))
@@ -979,7 +984,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
 		pte_access &= FNAME(gpte_access)(vcpu, gpte);
-		FNAME(protect_clean_gpte)(&pte_access, gpte);
+		FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 
 		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
 		      &nr_present))
@@ -1025,3 +1030,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef PT_GUEST_DIRTY_MASK
 #undef PT_GUEST_DIRTY_SHIFT
 #undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d1efe2c..c41f03e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1198,10 +1198,13 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_CLGI);
 	set_intercept(svm, INTERCEPT_SKINIT);
 	set_intercept(svm, INTERCEPT_WBINVD);
-	set_intercept(svm, INTERCEPT_MONITOR);
-	set_intercept(svm, INTERCEPT_MWAIT);
 	set_intercept(svm, INTERCEPT_XSETBV);
 
+	if (!kvm_mwait_in_guest()) {
+		set_intercept(svm, INTERCEPT_MONITOR);
+		set_intercept(svm, INTERCEPT_MWAIT);
+	}
+
 	control->iopm_base_pa = iopm_base;
 	control->msrpm_base_pa = __pa(svm->msrpm);
 	control->int_ctl = V_INTR_MASKING_MASK;
@@ -1379,6 +1382,9 @@ static void avic_vm_destroy(struct kvm *kvm)
 	unsigned long flags;
 	struct kvm_arch *vm_data = &kvm->arch;
 
+	if (!avic)
+		return;
+
 	avic_free_vm_id(vm_data->avic_vm_id);
 
 	if (vm_data->avic_logical_id_table_page)
@@ -5253,6 +5259,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
 	avic_handle_ldr_update(vcpu);
 }
 
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+	/* [63:9] are reserved. */
+	vcpu->arch.mcg_cap &= 0x1ff;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -5364,6 +5376,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.pmu_ops = &amd_pmu_ops,
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
 	.update_pi_irte = svm_update_pi_irte,
+	.setup_mce = svm_setup_mce,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 98e82ee..75e6cc9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
 static bool __read_mostly emulate_invalid_guest_state = true;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-static bool __read_mostly vmm_exclusive = 1;
-module_param(vmm_exclusive, bool, S_IRUGO);
-
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
@@ -615,10 +612,6 @@ struct vcpu_vmx {
 	int vpid;
 	bool emulation_required;
 
-	/* Support for vnmi-less CPUs */
-	int soft_vnmi_blocked;
-	ktime_t entry_time;
-	s64 vnmi_blocked_time;
 	u32 exit_reason;
 
 	/* Posted interrupt descriptor */
@@ -914,8 +907,6 @@ static void nested_release_page_clean(struct page *page)
 
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
-static void kvm_cpu_vmxon(u64 addr);
-static void kvm_cpu_vmxoff(void);
 static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -1239,6 +1230,11 @@ static inline bool cpu_has_vmx_invvpid_global(void)
 	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
 }
 
+static inline bool cpu_has_vmx_invvpid(void)
+{
+	return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
 static inline bool cpu_has_vmx_ept(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1285,11 +1281,6 @@ static inline bool cpu_has_vmx_invpcid(void)
 		SECONDARY_EXEC_ENABLE_INVPCID;
 }
 
-static inline bool cpu_has_virtual_nmis(void)
-{
-	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
-}
-
 static inline bool cpu_has_vmx_wbinvd_exit(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -2235,15 +2226,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(phys_addr);
-	else if (!already_loaded)
-		loaded_vmcs_clear(vmx->loaded_vmcs);
-
 	if (!already_loaded) {
+		loaded_vmcs_clear(vmx->loaded_vmcs);
 		local_irq_disable();
 		crash_disable_local_vmclear(cpu);
 
@@ -2321,11 +2307,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 	vmx_vcpu_pi_put(vcpu);
 
 	__vmx_load_host_state(to_vmx(vcpu));
-	if (!vmm_exclusive) {
-		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
-		vcpu->cpu = -1;
-		kvm_cpu_vmxoff();
-	}
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
@@ -2749,11 +2730,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_secondary_ctls_high);
 	vmx->nested.nested_vmx_secondary_ctls_low = 0;
 	vmx->nested.nested_vmx_secondary_ctls_high &=
+		SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_RDTSCP |
 		SECONDARY_EXEC_DESC |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-		SECONDARY_EXEC_ENABLE_VPID |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_WBINVD_EXITING |
@@ -2764,14 +2745,16 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		vmx->nested.nested_vmx_secondary_ctls_high |=
 			SECONDARY_EXEC_ENABLE_EPT;
 		vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
-			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
-			 VMX_EPT_INVEPT_BIT;
+			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
 		if (cpu_has_vmx_ept_execute_only())
 			vmx->nested.nested_vmx_ept_caps |=
 				VMX_EPT_EXECUTE_ONLY_BIT;
 		vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
 		vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
-			VMX_EPT_EXTENT_CONTEXT_BIT;
+			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+			VMX_EPT_1GB_PAGE_BIT;
+	       if (enable_ept_ad_bits)
+		       vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 
@@ -2781,10 +2764,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	 * though it is treated as global context.  The alternative is
 	 * not failing the single-context invvpid, and it is worse.
 	 */
-	if (enable_vpid)
+	if (enable_vpid) {
+		vmx->nested.nested_vmx_secondary_ctls_high |=
+			SECONDARY_EXEC_ENABLE_VPID;
 		vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
 			VMX_VPID_EXTENT_SUPPORTED_MASK;
-	else
+	} else
 		vmx->nested.nested_vmx_vpid_caps = 0;
 
 	if (enable_unrestricted_guest)
@@ -3416,6 +3401,7 @@ static __init int vmx_disabled_by_bios(void)
 
 static void kvm_cpu_vmxon(u64 addr)
 {
+	cr4_set_bits(X86_CR4_VMXE);
 	intel_pt_handle_vmx(1);
 
 	asm volatile (ASM_VMX_VMXON_RAX
@@ -3458,12 +3444,8 @@ static int hardware_enable(void)
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
-	cr4_set_bits(X86_CR4_VMXE);
-
-	if (vmm_exclusive) {
-		kvm_cpu_vmxon(phys_addr);
-		ept_sync_global();
-	}
+	kvm_cpu_vmxon(phys_addr);
+	ept_sync_global();
 
 	native_store_gdt(this_cpu_ptr(&host_gdt));
 
@@ -3489,15 +3471,13 @@ static void kvm_cpu_vmxoff(void)
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 
 	intel_pt_handle_vmx(0);
+	cr4_clear_bits(X86_CR4_VMXE);
 }
 
 static void hardware_disable(void)
 {
-	if (vmm_exclusive) {
-		vmclear_local_loaded_vmcss();
-		kvm_cpu_vmxoff();
-	}
-	cr4_clear_bits(X86_CR4_VMXE);
+	vmclear_local_loaded_vmcss();
+	kvm_cpu_vmxoff();
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -3547,11 +3527,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_USE_IO_BITMAPS |
 	      CPU_BASED_MOV_DR_EXITING |
 	      CPU_BASED_USE_TSC_OFFSETING |
-	      CPU_BASED_MWAIT_EXITING |
-	      CPU_BASED_MONITOR_EXITING |
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_RDPMC_EXITING;
 
+	if (!kvm_mwait_in_guest())
+		min |= CPU_BASED_MWAIT_EXITING |
+			CPU_BASED_MONITOR_EXITING;
+
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -3617,9 +3599,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_vmexit_control) < 0)
 		return -EIO;
 
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
-		 PIN_BASED_VMX_PREEMPTION_TIMER;
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+		PIN_BASED_VIRTUAL_NMIS;
+	opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
 				&_pin_based_exec_control) < 0)
 		return -EIO;
@@ -4011,11 +3993,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 {
-	vpid_sync_context(vpid);
 	if (enable_ept) {
 		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 			return;
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+	} else {
+		vpid_sync_context(vpid);
 	}
 }
 
@@ -4024,6 +4007,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
 }
 
+static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
+{
+	if (enable_ept)
+		vmx_flush_tlb(vcpu);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -5285,8 +5274,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vmx->rmode.vm86_active = 0;
 
-	vmx->soft_vnmi_blocked = 0;
-
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(vcpu, 0);
 
@@ -5406,8 +5393,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
-	if (!cpu_has_virtual_nmis() ||
-	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+	if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
 		enable_irq_window(vcpu);
 		return;
 	}
@@ -5448,19 +5434,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	if (!is_guest_mode(vcpu)) {
-		if (!cpu_has_virtual_nmis()) {
-			/*
-			 * Tracking the NMI-blocked state in software is built upon
-			 * finding the next open IRQ window. This, in turn, depends on
-			 * well-behaving guests: They have to keep IRQs disabled at
-			 * least as long as the NMI handler runs. Otherwise we may
-			 * cause NMI nesting, maybe breaking the guest. But as this is
-			 * highly unlikely, we can live with the residual risk.
-			 */
-			vmx->soft_vnmi_blocked = 1;
-			vmx->vnmi_blocked_time = 0;
-		}
-
 		++vcpu->stat.nmi_injections;
 		vmx->nmi_known_unmasked = false;
 	}
@@ -5477,8 +5450,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-	if (!cpu_has_virtual_nmis())
-		return to_vmx(vcpu)->soft_vnmi_blocked;
 	if (to_vmx(vcpu)->nmi_known_unmasked)
 		return false;
 	return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)	& GUEST_INTR_STATE_NMI;
@@ -5488,20 +5459,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (!cpu_has_virtual_nmis()) {
-		if (vmx->soft_vnmi_blocked != masked) {
-			vmx->soft_vnmi_blocked = masked;
-			vmx->vnmi_blocked_time = 0;
-		}
-	} else {
-		vmx->nmi_known_unmasked = !masked;
-		if (masked)
-			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				      GUEST_INTR_STATE_NMI);
-		else
-			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-					GUEST_INTR_STATE_NMI);
-	}
+	vmx->nmi_known_unmasked = !masked;
+	if (masked)
+		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+			      GUEST_INTR_STATE_NMI);
+	else
+		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+				GUEST_INTR_STATE_NMI);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5509,9 +5473,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return 0;
 
-	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
-		return 0;
-
 	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
 		   | GUEST_INTR_STATE_NMI));
@@ -6232,21 +6193,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	unsigned long exit_qualification;
 	gpa_t gpa;
 	u32 error_code;
-	int gla_validity;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-	gla_validity = (exit_qualification >> 7) & 0x3;
-	if (gla_validity == 0x2) {
-		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
-		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
-			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
-			vmcs_readl(GUEST_LINEAR_ADDRESS));
-		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
-			(long unsigned int)exit_qualification);
-		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
-		return 0;
+	if (is_guest_mode(vcpu)
+	    && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
+		/*
+		 * Fix up exit_qualification according to whether guest
+		 * page table accesses are reads or writes.
+		 */
+		u64 eptp = nested_ept_get_cr3(vcpu);
+		if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
+			exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
 	}
 
 	/*
@@ -6256,7 +6214,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	 * AAK134, BY25.
 	 */
 	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-			cpu_has_virtual_nmis() &&
 			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
 		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
 
@@ -6342,7 +6299,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
 			return handle_interrupt_window(&vmx->vcpu);
 
-		if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
 			return 1;
 
 		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
@@ -6517,8 +6474,10 @@ static __init int hardware_setup(void)
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
-	if (!cpu_has_vmx_vpid())
+	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+		!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
 		enable_vpid = 0;
+
 	if (!cpu_has_vmx_shadow_vmcs())
 		enable_shadow_vmcs = 0;
 	if (enable_shadow_vmcs)
@@ -7093,34 +7052,24 @@ out_msr_bitmap:
 static int handle_vmon(struct kvm_vcpu *vcpu)
 {
 	int ret;
-	struct kvm_segment cs;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
 		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
-	/* The Intel VMX Instruction Reference lists a bunch of bits that
-	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
-	 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
-	 * Otherwise, we should fail with #UD. We test these now:
+	/*
+	 * The Intel VMX Instruction Reference lists a bunch of bits that are
+	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
+	 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
+	 * Otherwise, we should fail with #UD.  But most faulting conditions
+	 * have already been checked by hardware, prior to the VM-exit for
+	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
+	 * that bit set to 1 in non-root mode.
 	 */
-	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
-	    !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
-	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 1;
-	}
-
-	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	if (is_long_mode(vcpu) && !cs.l) {
+	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
 
-	if (vmx_get_cpl(vcpu)) {
-		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-
 	if (vmx->nested.vmxon) {
 		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 		return kvm_skip_emulated_instruction(vcpu);
@@ -7147,29 +7096,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
  * Intel's VMX Instruction Reference specifies a common set of prerequisites
  * for running VMX instructions (except VMXON, whose prerequisites are
  * slightly different). It also specifies what exception to inject otherwise.
+ * Note that many of these exceptions have priority over VM exits, so they
+ * don't have to be checked again here.
  */
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 {
-	struct kvm_segment cs;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!vmx->nested.vmxon) {
+	if (!to_vmx(vcpu)->nested.vmxon) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 0;
 	}
-
-	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
-	    (is_long_mode(vcpu) && !cs.l)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 0;
-	}
-
-	if (vmx_get_cpl(vcpu)) {
-		kvm_inject_gp(vcpu, 0);
-		return 0;
-	}
-
 	return 1;
 }
 
@@ -7513,7 +7448,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		if (get_vmx_mem_address(vcpu, exit_qualification,
 				vmx_instruction_info, true, &gva))
 			return 1;
-		/* _system ok, as nested_vmx_check_permission verified cpl=0 */
+		/* _system ok, as hardware has verified cpl=0 */
 		kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
 			     &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
 	}
@@ -7646,7 +7581,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
 	if (get_vmx_mem_address(vcpu, exit_qualification,
 			vmx_instruction_info, true, &vmcs_gva))
 		return 1;
-	/* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
+	/* ok to use *_system, as hardware has verified cpl=0 */
 	if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
 				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
 				 sizeof(u64), &e)) {
@@ -7679,11 +7614,6 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 
-	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 1;
-	}
-
 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 
@@ -7805,7 +7735,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 	 * "blocked by NMI" bit has to be set before next VM entry.
 	 */
 	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-			cpu_has_virtual_nmis() &&
 			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
 		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				GUEST_INTR_STATE_NMI);
@@ -8107,6 +8036,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
 	case EXIT_REASON_RDPMC:
 		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+	case EXIT_REASON_RDRAND:
+		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+	case EXIT_REASON_RDSEED:
+		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
 	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
 		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -8477,31 +8410,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
-	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-					get_vmcs12(vcpu))))) {
-		if (vmx_interrupt_allowed(vcpu)) {
-			vmx->soft_vnmi_blocked = 0;
-		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
-			   vcpu->arch.nmi_pending) {
-			/*
-			 * This CPU don't support us in finding the end of an
-			 * NMI-blocked window if the guest runs with IRQs
-			 * disabled. So we pull the trigger after 1 s of
-			 * futile waiting, but inform the user about this.
-			 */
-			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
-			       "state on VCPU %d after 1 s timeout\n",
-			       __func__, vcpu->vcpu_id);
-			vmx->soft_vnmi_blocked = 0;
-		}
-	}
-
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu);
 	else {
-		WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+		vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+				exit_reason);
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
@@ -8547,6 +8461,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	} else {
 		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		vmx_flush_tlb_ept_only(vcpu);
 	}
 	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
@@ -8572,8 +8487,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 	 */
 	if (!is_guest_mode(vcpu) ||
 	    !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
-			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
 		vmcs_write64(APIC_ACCESS_ADDR, hpa);
+		vmx_flush_tlb_ept_only(vcpu);
+	}
 }
 
 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
@@ -8768,37 +8685,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 
 	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-	if (cpu_has_virtual_nmis()) {
-		if (vmx->nmi_known_unmasked)
-			return;
-		/*
-		 * Can't use vmx->exit_intr_info since we're not sure what
-		 * the exit reason is.
-		 */
-		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
-		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
-		/*
-		 * SDM 3: 27.7.1.2 (September 2008)
-		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
-		 * a guest IRET fault.
-		 * SDM 3: 23.2.2 (September 2008)
-		 * Bit 12 is undefined in any of the following cases:
-		 *  If the VM exit sets the valid bit in the IDT-vectoring
-		 *   information field.
-		 *  If the VM exit is due to a double fault.
-		 */
-		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
-		    vector != DF_VECTOR && !idtv_info_valid)
-			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				      GUEST_INTR_STATE_NMI);
-		else
-			vmx->nmi_known_unmasked =
-				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
-				  & GUEST_INTR_STATE_NMI);
-	} else if (unlikely(vmx->soft_vnmi_blocked))
-		vmx->vnmi_blocked_time +=
-			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+	if (vmx->nmi_known_unmasked)
+		return;
+	/*
+	 * Can't use vmx->exit_intr_info since we're not sure what
+	 * the exit reason is.
+	 */
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+	vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+	/*
+	 * SDM 3: 27.7.1.2 (September 2008)
+	 * Re-set bit "block by NMI" before VM entry if vmexit caused by
+	 * a guest IRET fault.
+	 * SDM 3: 23.2.2 (September 2008)
+	 * Bit 12 is undefined in any of the following cases:
+	 *  If the VM exit sets the valid bit in the IDT-vectoring
+	 *   information field.
+	 *  If the VM exit is due to a double fault.
+	 */
+	if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+	    vector != DF_VECTOR && !idtv_info_valid)
+		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+			      GUEST_INTR_STATE_NMI);
+	else
+		vmx->nmi_known_unmasked =
+			!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+			  & GUEST_INTR_STATE_NMI);
 }
 
 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -8915,10 +8828,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long debugctlmsr, cr4;
 
-	/* Record the guest's net vcpu time for enforced NMI injections. */
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
-		vmx->entry_time = ktime_get();
-
 	/* Don't enter VMX if guest state is invalid, let the exit handler
 	   start emulation until we arrive back to a valid state */
 	if (vmx->emulation_required)
@@ -9126,16 +9035,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx_complete_interrupts(vmx);
 }
 
-static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int cpu;
 
-	if (vmx->loaded_vmcs == &vmx->vmcs01)
+	if (vmx->loaded_vmcs == vmcs)
 		return;
 
 	cpu = get_cpu();
-	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx->loaded_vmcs = vmcs;
 	vmx_vcpu_put(vcpu);
 	vmx_vcpu_load(vcpu, cpu);
 	vcpu->cpu = cpu;
@@ -9153,7 +9062,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 
        r = vcpu_load(vcpu);
        BUG_ON(r);
-       vmx_load_vmcs01(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
        free_nested(vmx);
        vcpu_put(vcpu);
 }
@@ -9214,11 +9123,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	vmx->loaded_vmcs->shadow_vmcs = NULL;
 	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
 	loaded_vmcs_init(vmx->loaded_vmcs);
-	if (!vmm_exclusive)
-		kvm_cpu_vmxoff();
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -9478,17 +9383,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 	return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
+	u64 eptp;
+
 	WARN_ON(mmu_is_nested(vcpu));
+	eptp = nested_ept_get_cr3(vcpu);
+	if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
+		return 1;
+
+	kvm_mmu_unload(vcpu);
 	kvm_init_shadow_ept_mmu(vcpu,
 			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
-			VMX_EPT_EXECUTE_ONLY_BIT);
+			VMX_EPT_EXECUTE_ONLY_BIT,
+			eptp & VMX_EPT_AD_ENABLE_BIT);
 	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 
 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+	return 0;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -9974,7 +9888,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 exec_control;
-	bool nested_ept_enabled = false;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
 	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -10121,8 +10034,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 				vmcs12->guest_intr_status);
 		}
 
-		nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
-
 		/*
 		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
 		 * nested_get_vmcs12_pages will either fix it up or
@@ -10253,8 +10164,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 
 	if (nested_cpu_has_ept(vmcs12)) {
-		kvm_mmu_unload(vcpu);
-		nested_ept_init_mmu_context(vcpu);
+		if (nested_ept_init_mmu_context(vcpu)) {
+			*entry_failure_code = ENTRY_FAIL_DEFAULT;
+			return 1;
+		}
+	} else if (nested_cpu_has2(vmcs12,
+				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+		vmx_flush_tlb_ept_only(vcpu);
 	}
 
 	/*
@@ -10282,12 +10198,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmx_set_efer(vcpu, vcpu->arch.efer);
 
 	/* Shadow page tables on either EPT or shadow page tables. */
-	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled,
+	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
 				entry_failure_code))
 		return 1;
 
-	kvm_mmu_reset_context(vcpu);
-
 	if (!enable_ept)
 		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
 
@@ -10407,7 +10321,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct loaded_vmcs *vmcs02;
-	int cpu;
 	u32 msr_entry_idx;
 	u32 exit_qual;
 
@@ -10420,18 +10333,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 
-	cpu = get_cpu();
-	vmx->loaded_vmcs = vmcs02;
-	vmx_vcpu_put(vcpu);
-	vmx_vcpu_load(vcpu, cpu);
-	vcpu->cpu = cpu;
-	put_cpu();
-
+	vmx_switch_vmcs(vcpu, vmcs02);
 	vmx_segment_cache_clear(vmx);
 
 	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
 		leave_guest_mode(vcpu);
-		vmx_load_vmcs01(vcpu);
+		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 		nested_vmx_entry_failure(vcpu, vmcs12,
 					 EXIT_REASON_INVALID_STATE, exit_qual);
 		return 1;
@@ -10444,7 +10351,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 					    vmcs12->vm_entry_msr_load_count);
 	if (msr_entry_idx) {
 		leave_guest_mode(vcpu);
-		vmx_load_vmcs01(vcpu);
+		vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 		nested_vmx_entry_failure(vcpu, vmcs12,
 				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
 		return 1;
@@ -11012,7 +10919,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	if (unlikely(vmx->fail))
 		vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
 
-	vmx_load_vmcs01(vcpu);
+	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
 	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
 	    && nested_exit_intr_ack_set(vcpu)) {
@@ -11056,6 +10963,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 		vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
 		vmx_set_virtual_x2apic_mode(vcpu,
 				vcpu->arch.apic_base & X2APIC_ENABLE);
+	} else if (!nested_cpu_has_ept(vmcs12) &&
+		   nested_cpu_has2(vmcs12,
+				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+		vmx_flush_tlb_ept_only(vcpu);
 	}
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1faf620..be2ade5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,7 +27,6 @@
 #include "kvm_cache_regs.h"
 #include "x86.h"
 #include "cpuid.h"
-#include "assigned-dev.h"
 #include "pmu.h"
 #include "hyperv.h"
 
@@ -1008,6 +1007,8 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MCG_CTL,
 	MSR_IA32_MCG_EXT_CTL,
 	MSR_IA32_SMBASE,
+	MSR_PLATFORM_INFO,
+	MSR_MISC_FEATURES_ENABLES,
 };
 
 static unsigned num_emulated_msrs;
@@ -1444,10 +1445,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 usdiff;
 	bool matched;
 	bool already_matched;
 	u64 data = msr->data;
+	bool synchronizing = false;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
@@ -1455,51 +1456,34 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
-		int faulted = 0;
-
-		/* n.b - signed multiplication and division required */
-		usdiff = data - kvm->arch.last_tsc_write;
-#ifdef CONFIG_X86_64
-		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
-#else
-		/* do_div() only does unsigned */
-		asm("1: idivl %[divisor]\n"
-		    "2: xor %%edx, %%edx\n"
-		    "   movl $0, %[faulted]\n"
-		    "3:\n"
-		    ".section .fixup,\"ax\"\n"
-		    "4: movl $1, %[faulted]\n"
-		    "   jmp  3b\n"
-		    ".previous\n"
-
-		_ASM_EXTABLE(1b, 4b)
-
-		: "=A"(usdiff), [faulted] "=r" (faulted)
-		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
-
-#endif
-		do_div(elapsed, 1000);
-		usdiff -= elapsed;
-		if (usdiff < 0)
-			usdiff = -usdiff;
-
-		/* idivl overflow => difference is larger than USEC_PER_SEC */
-		if (faulted)
-			usdiff = USEC_PER_SEC;
-	} else
-		usdiff = USEC_PER_SEC; /* disable TSC match window below */
+		if (data == 0 && msr->host_initiated) {
+			/*
+			 * detection of vcpu initialization -- need to sync
+			 * with other vCPUs. This particularly helps to keep
+			 * kvm_clock stable after CPU hotplug
+			 */
+			synchronizing = true;
+		} else {
+			u64 tsc_exp = kvm->arch.last_tsc_write +
+						nsec_to_cycles(vcpu, elapsed);
+			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+			/*
+			 * Special case: TSC write with a small delta (1 second)
+			 * of virtual cycle time against real time is
+			 * interpreted as an attempt to synchronize the CPU.
+			 */
+			synchronizing = data < tsc_exp + tsc_hz &&
+					data + tsc_hz > tsc_exp;
+		}
+	}
 
 	/*
-	 * Special case: TSC write with a small delta (1 second) of virtual
-	 * cycle time against real time is interpreted as an attempt to
-	 * synchronize the CPU.
-         *
 	 * For a reliable TSC, we can match TSC offsets, and for an unstable
 	 * TSC, we add elapsed time in this computation.  We could let the
 	 * compensation code attempt to catch up if we fall behind, but
 	 * it's better to try to match offsets from the beginning.
          */
-	if (usdiff < USEC_PER_SEC &&
+	if (synchronizing &&
 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
 			offset = kvm->arch.cur_tsc_offset;
@@ -1769,13 +1753,13 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 
 	/* guest entries allowed */
 	kvm_for_each_vcpu(i, vcpu, kvm)
-		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 
 	spin_unlock(&ka->pvclock_gtod_sync_lock);
 #endif
 }
 
-static u64 __get_kvmclock_ns(struct kvm *kvm)
+u64 get_kvmclock_ns(struct kvm *kvm)
 {
 	struct kvm_arch *ka = &kvm->arch;
 	struct pvclock_vcpu_time_info hv_clock;
@@ -1796,18 +1780,6 @@ static u64 __get_kvmclock_ns(struct kvm *kvm)
 	return __pvclock_read_cycles(&hv_clock, rdtsc());
 }
 
-u64 get_kvmclock_ns(struct kvm *kvm)
-{
-	unsigned long flags;
-	s64 ns;
-
-	local_irq_save(flags);
-	ns = __get_kvmclock_ns(kvm);
-	local_irq_restore(flags);
-
-	return ns;
-}
-
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 {
 	struct kvm_vcpu_arch *vcpu = &v->arch;
@@ -2155,6 +2127,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_VM_HSAVE_PA:
 	case MSR_AMD64_PATCH_LOADER:
 	case MSR_AMD64_BU_CFG2:
+	case MSR_AMD64_DC_CFG:
 		break;
 
 	case MSR_EFER:
@@ -2230,8 +2203,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
 
 			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
-				set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
-					&vcpu->requests);
+				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 
 			ka->boot_vcpu_runs_old_kvmclock = tmp;
 		}
@@ -2331,6 +2303,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vcpu->arch.osvw.status = data;
 		break;
+	case MSR_PLATFORM_INFO:
+		if (!msr_info->host_initiated ||
+		    data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
+		    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
+		     cpuid_fault_enabled(vcpu)))
+			return 1;
+		vcpu->arch.msr_platform_info = data;
+		break;
+	case MSR_MISC_FEATURES_ENABLES:
+		if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+		    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+		     !supports_cpuid_fault(vcpu)))
+			return 1;
+		vcpu->arch.msr_misc_features_enables = data;
+		break;
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
@@ -2417,6 +2404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_FAM10H_MMIO_CONF_BASE:
 	case MSR_AMD64_BU_CFG2:
 	case MSR_IA32_PERF_CTL:
+	case MSR_AMD64_DC_CFG:
 		msr_info->data = 0;
 		break;
 	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
@@ -2545,6 +2533,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vcpu->arch.osvw.status;
 		break;
+	case MSR_PLATFORM_INFO:
+		msr_info->data = vcpu->arch.msr_platform_info;
+		break;
+	case MSR_MISC_FEATURES_ENABLES:
+		msr_info->data = vcpu->arch.msr_misc_features_enables;
+		break;
 	default:
 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
 			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2675,15 +2669,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_BOOT_CPU_ID:
  	case KVM_CAP_SPLIT_IRQCHIP:
 	case KVM_CAP_IMMEDIATE_EXIT:
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-	case KVM_CAP_ASSIGN_DEV_IRQ:
-	case KVM_CAP_PCI_2_3:
-#endif
 		r = 1;
 		break;
 	case KVM_CAP_ADJUST_CLOCK:
 		r = KVM_CLOCK_TSC_STABLE;
 		break;
+	case KVM_CAP_X86_GUEST_MWAIT:
+		r = kvm_mwait_in_guest();
+		break;
 	case KVM_CAP_X86_SMM:
 		/* SMBASE is usually relocated above 1M on modern chipsets,
 		 * and SMM handlers might indeed rely on 4G segment limits,
@@ -2695,9 +2688,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		 */
 		r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
 		break;
-	case KVM_CAP_COALESCED_MMIO:
-		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-		break;
 	case KVM_CAP_VAPIC:
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 		break;
@@ -2713,11 +2703,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PV_MMU:	/* obsolete */
 		r = 0;
 		break;
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-	case KVM_CAP_IOMMU:
-		r = iommu_present(&pci_bus_type);
-		break;
-#endif
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
@@ -2816,11 +2801,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
 }
 
-static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
-{
-	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
-}
-
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	/* Address WBINVD may be executed by guest */
@@ -2864,7 +2844,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
 			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != cpu)
-			kvm_migrate_timers(vcpu);
+			kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
 		vcpu->cpu = cpu;
 	}
 
@@ -3124,7 +3104,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 
 	if (events->exception.injected &&
-	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
+	     is_guest_mode(vcpu)))
+		return -EINVAL;
+
+	/* INITs are latched while in SMM */
+	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+	    (events->smi.smm || events->smi.pending) &&
+	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
 		return -EINVAL;
 
 	process_nmi(vcpu);
@@ -3721,22 +3708,21 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
+	struct kvm_pic *pic = kvm->arch.vpic;
 	int r;
 
 	r = 0;
 	switch (chip->chip_id) {
 	case KVM_IRQCHIP_PIC_MASTER:
-		memcpy(&chip->chip.pic,
-			&pic_irqchip(kvm)->pics[0],
+		memcpy(&chip->chip.pic, &pic->pics[0],
 			sizeof(struct kvm_pic_state));
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
-		memcpy(&chip->chip.pic,
-			&pic_irqchip(kvm)->pics[1],
+		memcpy(&chip->chip.pic, &pic->pics[1],
 			sizeof(struct kvm_pic_state));
 		break;
 	case KVM_IRQCHIP_IOAPIC:
-		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
+		kvm_get_ioapic(kvm, &chip->chip.ioapic);
 		break;
 	default:
 		r = -EINVAL;
@@ -3747,32 +3733,31 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 
 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
+	struct kvm_pic *pic = kvm->arch.vpic;
 	int r;
 
 	r = 0;
 	switch (chip->chip_id) {
 	case KVM_IRQCHIP_PIC_MASTER:
-		spin_lock(&pic_irqchip(kvm)->lock);
-		memcpy(&pic_irqchip(kvm)->pics[0],
-			&chip->chip.pic,
+		spin_lock(&pic->lock);
+		memcpy(&pic->pics[0], &chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic->lock);
 		break;
 	case KVM_IRQCHIP_PIC_SLAVE:
-		spin_lock(&pic_irqchip(kvm)->lock);
-		memcpy(&pic_irqchip(kvm)->pics[1],
-			&chip->chip.pic,
+		spin_lock(&pic->lock);
+		memcpy(&pic->pics[1], &chip->chip.pic,
 			sizeof(struct kvm_pic_state));
-		spin_unlock(&pic_irqchip(kvm)->lock);
+		spin_unlock(&pic->lock);
 		break;
 	case KVM_IRQCHIP_IOAPIC:
-		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
+		kvm_set_ioapic(kvm, &chip->chip.ioapic);
 		break;
 	default:
 		r = -EINVAL;
 		break;
 	}
-	kvm_pic_update_irq(pic_irqchip(kvm));
+	kvm_pic_update_irq(pic);
 	return r;
 }
 
@@ -3934,9 +3919,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			goto split_irqchip_unlock;
 		if (kvm->created_vcpus)
 			goto split_irqchip_unlock;
+		kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS;
 		r = kvm_setup_empty_irq_routing(kvm);
-		if (r)
+		if (r) {
+			kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE;
+			/* Pairs with smp_rmb() when reading irqchip_mode */
+			smp_wmb();
 			goto split_irqchip_unlock;
+		}
 		/* Pairs with irqchip_in_kernel. */
 		smp_wmb();
 		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
@@ -4018,20 +4008,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		r = kvm_ioapic_init(kvm);
 		if (r) {
-			mutex_lock(&kvm->slots_lock);
 			kvm_pic_destroy(kvm);
-			mutex_unlock(&kvm->slots_lock);
 			goto create_irqchip_unlock;
 		}
 
+		kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS;
 		r = kvm_setup_default_irq_routing(kvm);
 		if (r) {
-			mutex_lock(&kvm->slots_lock);
-			mutex_lock(&kvm->irq_lock);
+			kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE;
+			/* Pairs with smp_rmb() when reading irqchip_mode */
+			smp_wmb();
 			kvm_ioapic_destroy(kvm);
 			kvm_pic_destroy(kvm);
-			mutex_unlock(&kvm->irq_lock);
-			mutex_unlock(&kvm->slots_lock);
 			goto create_irqchip_unlock;
 		}
 		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
@@ -4196,10 +4184,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 
 		r = 0;
-		local_irq_disable();
-		now_ns = __get_kvmclock_ns(kvm);
+		now_ns = get_kvmclock_ns(kvm);
 		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
-		local_irq_enable();
 		kvm_gen_update_masterclock(kvm);
 		break;
 	}
@@ -4207,11 +4193,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		local_irq_disable();
-		now_ns = __get_kvmclock_ns(kvm);
+		now_ns = get_kvmclock_ns(kvm);
 		user_ns.clock = now_ns;
 		user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
-		local_irq_enable();
 		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
 
 		r = -EFAULT;
@@ -4230,7 +4214,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	default:
-		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
+		r = -ENOTTY;
 	}
 out:
 	return r;
@@ -5223,6 +5207,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
 	kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
 }
 
+static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+{
+	return emul_to_vcpu(ctxt)->arch.hflags;
+}
+
+static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+{
+	kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+}
+
 static const struct x86_emulate_ops emulate_ops = {
 	.read_gpr            = emulator_read_gpr,
 	.write_gpr           = emulator_write_gpr,
@@ -5262,6 +5256,8 @@ static const struct x86_emulate_ops emulate_ops = {
 	.intercept           = emulator_intercept,
 	.get_cpuid           = emulator_get_cpuid,
 	.set_nmi_mask        = emulator_set_nmi_mask,
+	.get_hflags          = emulator_get_hflags,
+	.set_hflags          = emulator_set_hflags,
 };
 
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -5314,7 +5310,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
 	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
 	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
-	ctxt->emul_flags = vcpu->arch.hflags;
 
 	init_decode_cache(ctxt);
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5718,8 +5713,6 @@ restart:
 		unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-		if (vcpu->arch.hflags != ctxt->emul_flags)
-			kvm_set_hflags(vcpu, ctxt->emul_flags);
 		kvm_rip_write(vcpu, ctxt->eip);
 		if (r == EMULATE_DONE)
 			kvm_vcpu_check_singlestep(vcpu, rflags, &r);
@@ -6869,7 +6862,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	/*
 	 * 1) We should set ->mode before checking ->requests.  Please see
-	 * the comment in kvm_make_all_cpus_request.
+	 * the comment in kvm_vcpu_exiting_guest_mode().
 	 *
 	 * 2) For APICv, we should set ->mode before checking PIR.ON.  This
 	 * pairs with the memory barrier implicit in pi_test_and_set_on
@@ -7051,7 +7044,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 		if (r <= 0)
 			break;
 
-		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
 		if (kvm_cpu_has_pending_timer(vcpu))
 			kvm_inject_pending_timer_irqs(vcpu);
 
@@ -7179,7 +7172,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
 		kvm_apic_accept_events(vcpu);
-		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 		r = -EAGAIN;
 		goto out;
 	}
@@ -7355,6 +7348,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
 		return -EINVAL;
 
+	/* INITs are latched while in SMM */
+	if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+		return -EINVAL;
+
 	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
 		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
@@ -7724,6 +7723,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	if (!init_event) {
 		kvm_pmu_reset(vcpu);
 		vcpu->arch.smbase = 0x30000;
+
+		vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+		vcpu->arch.msr_misc_features_enables = 0;
 	}
 
 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
@@ -8068,7 +8070,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
 {
 	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
 	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
-	kvm_free_all_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 }
 
@@ -8152,12 +8153,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	}
 	if (kvm_x86_ops->vm_destroy)
 		kvm_x86_ops->vm_destroy(kvm);
-	kvm_iommu_unmap_guest(kvm);
-	kfree(kvm->arch.vpic);
-	kfree(kvm->arch.vioapic);
+	kvm_pic_destroy(kvm);
+	kvm_ioapic_destroy(kvm);
 	kvm_free_vcpus(kvm);
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kvm_mmu_uninit_vm(kvm);
+	kvm_page_track_cleanup(kvm);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -8384,7 +8385,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (atomic_read(&vcpu->arch.nmi_queued))
 		return true;
 
-	if (test_bit(KVM_REQ_SMI, &vcpu->requests))
+	if (kvm_test_request(KVM_REQ_SMI, vcpu))
 		return true;
 
 	if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -8566,11 +8567,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 {
 	struct x86_exception fault;
 
-	trace_kvm_async_pf_ready(work->arch.token, work->gva);
 	if (work->wakeup_all)
 		work->arch.token = ~0; /* broadcast wakeup */
 	else
 		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+	trace_kvm_async_pf_ready(work->arch.token, work->gva);
 
 	if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
 	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e8ff3e4..6120670 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -1,6 +1,8 @@
 #ifndef ARCH_X86_KVM_X86_H
 #define ARCH_X86_KVM_X86_H
 
+#include <asm/processor.h>
+#include <asm/mwait.h>
 #include <linux/kvm_host.h>
 #include <asm/pvclock.h>
 #include "kvm_cache_regs.h"
@@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 	    __rem;						\
 	 })
 
+static inline bool kvm_mwait_in_guest(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
+		return false;
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		/* All AMD CPUs have a working MWAIT implementation */
+		return true;
+	case X86_VENDOR_INTEL:
+		/* Handle Intel below */
+		break;
+	default:
+		return false;
+	}
+
+	/*
+	 * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
+	 * they would allow guest to stop the CPU completely by disabling
+	 * interrupts then invoking MWAIT.
+	 */
+	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+		return false;
+
+	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+	if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
+		return false;
+
+	return true;
+}
+
 #endif
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index e7e7055..69f0827 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
 
 ifeq ($(CONFIG_X86_32),y)
 
-obj-y += checksum_32.o
+obj-y += checksum_32.o syscalls_32.o
 obj-$(CONFIG_ELF_CORE) += elfcore.o
 
 subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
index e59eef2..b291ca5 100644
--- a/arch/x86/um/asm/ptrace.h
+++ b/arch/x86/um/asm/ptrace.h
@@ -78,7 +78,7 @@ static inline int ptrace_set_thread_area(struct task_struct *child, int idx,
         return -ENOSYS;
 }
 
-extern long arch_prctl(struct task_struct *task, int code,
+extern long arch_prctl(struct task_struct *task, int option,
 		       unsigned long __user *addr);
 
 #endif
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
index 96eb2bd..8431e87 100644
--- a/arch/x86/um/os-Linux/prctl.c
+++ b/arch/x86/um/os-Linux/prctl.c
@@ -6,7 +6,7 @@
 #include <sys/ptrace.h>
 #include <asm/ptrace.h>
 
-int os_arch_prctl(int pid, int code, unsigned long *addr)
+int os_arch_prctl(int pid, int option, unsigned long *arg2)
 {
-        return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code);
+	return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option);
 }
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
new file mode 100644
index 0000000..627d688
--- /dev/null
+++ b/arch/x86/um/syscalls_32.c
@@ -0,0 +1,7 @@
+#include <linux/syscalls.h>
+#include <os.h>
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+	return -EINVAL;
+}
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 10d9070..58f5166 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -7,13 +7,15 @@
 
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
+#include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/prctl.h> /* XXX This should get the constants from libc */
 #include <os.h>
 
-long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
+long arch_prctl(struct task_struct *task, int option,
+		unsigned long __user *arg2)
 {
-	unsigned long *ptr = addr, tmp;
+	unsigned long *ptr = arg2, tmp;
 	long ret;
 	int pid = task->mm->context.id.u.pid;
 
@@ -30,7 +32,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 	 * arch_prctl is run on the host, then the registers are read
 	 * back.
 	 */
-	switch (code) {
+	switch (option) {
 	case ARCH_SET_FS:
 	case ARCH_SET_GS:
 		ret = restore_registers(pid, &current->thread.regs.regs);
@@ -50,11 +52,11 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 		ptr = &tmp;
 	}
 
-	ret = os_arch_prctl(pid, code, ptr);
+	ret = os_arch_prctl(pid, option, ptr);
 	if (ret)
 		return ret;
 
-	switch (code) {
+	switch (option) {
 	case ARCH_SET_FS:
 		current->thread.arch.fs = (unsigned long) ptr;
 		ret = save_registers(pid, &current->thread.regs.regs);
@@ -63,19 +65,19 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 		ret = save_registers(pid, &current->thread.regs.regs);
 		break;
 	case ARCH_GET_FS:
-		ret = put_user(tmp, addr);
+		ret = put_user(tmp, arg2);
 		break;
 	case ARCH_GET_GS:
-		ret = put_user(tmp, addr);
+		ret = put_user(tmp, arg2);
 		break;
 	}
 
 	return ret;
 }
 
-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 {
-	return arch_prctl(current, code, (unsigned long __user *) addr);
+	return arch_prctl(current, option, (unsigned long __user *) arg2);
 }
 
 void arch_switch_to(struct task_struct *to)