diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-06 10:49:01 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-06 10:49:01 -0700 |
commit | 6218590bcb452c3da7517d02b588d4d0a8628f73 (patch) | |
tree | 8b6a285052ac999e0e36e04f0c1e6bbfb46e84c4 /arch | |
parent | 14986a34e1289424811443a524cdd9e1688c7913 (diff) | |
parent | d9ab710b85310e4ba9295f2b494eda54cf1a355a (diff) | |
download | op-kernel-dev-6218590bcb452c3da7517d02b588d4d0a8628f73.zip op-kernel-dev-6218590bcb452c3da7517d02b588d4d0a8628f73.tar.gz |
Merge tag 'kvm-4.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Radim Krčmář:
"All architectures:
- move `make kvmconfig` stubs from x86
- use 64 bits for debugfs stats
ARM:
- Important fixes for not using an in-kernel irqchip
- handle SError exceptions and present them to guests if appropriate
- proxying of GICV access at EL2 if guest mappings are unsafe
- GICv3 on AArch32 on ARMv8
- preparations for GICv3 save/restore, including ABI docs
- cleanups and a bit of optimizations
MIPS:
- A couple of fixes in preparation for supporting MIPS EVA host
kernels
- MIPS SMP host & TLB invalidation fixes
PPC:
- Fix the bug which caused guests to falsely report lockups
- other minor fixes
- a small optimization
s390:
- Lazy enablement of runtime instrumentation
- up to 255 CPUs for nested guests
- rework of machine check deliver
- cleanups and fixes
x86:
- IOMMU part of AMD's AVIC for vmexit-less interrupt delivery
- Hyper-V TSC page
- per-vcpu tsc_offset in debugfs
- accelerated INS/OUTS in nVMX
- cleanups and fixes"
* tag 'kvm-4.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (140 commits)
KVM: MIPS: Drop dubious EntryHi optimisation
KVM: MIPS: Invalidate TLB by regenerating ASIDs
KVM: MIPS: Split kernel/user ASID regeneration
KVM: MIPS: Drop other CPU ASIDs on guest MMU changes
KVM: arm/arm64: vgic: Don't flush/sync without a working vgic
KVM: arm64: Require in-kernel irqchip for PMU support
KVM: PPC: Book3s PR: Allow access to unprivileged MMCR2 register
KVM: PPC: Book3S PR: Support 64kB page size on POWER8E and POWER8NVL
KVM: PPC: Book3S: Remove duplicate setting of the B field in tlbie
KVM: PPC: BookE: Fix a sanity check
KVM: PPC: Book3S HV: Take out virtual core piggybacking code
KVM: PPC: Book3S: Treat VTB as a per-subcore register, not per-thread
ARM: gic-v3: Work around definition of gic_write_bpr1
KVM: nVMX: Fix the NMI IDT-vectoring handling
KVM: VMX: Enable MSR-BASED TPR shadow even if APICv is inactive
KVM: nVMX: Fix reload apic access page warning
kvmconfig: add virtio-gpu to config fragment
config: move x86 kvm_guest.config to a common location
arm64: KVM: Remove duplicating init code for setting VMID
ARM: KVM: Support vgic-v3
...
Diffstat (limited to 'arch')
101 files changed, 3192 insertions, 1840 deletions
diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index dfe4002..a808829 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -22,9 +22,7 @@ #include <linux/io.h> #include <asm/barrier.h> - -#define __ACCESS_CP15(CRn, Op1, CRm, Op2) p15, Op1, %0, CRn, CRm, Op2 -#define __ACCESS_CP15_64(Op1, CRm) p15, Op1, %Q0, %R0, CRm +#include <asm/cp15.h> #define ICC_EOIR1 __ACCESS_CP15(c12, 0, c12, 1) #define ICC_DIR __ACCESS_CP15(c12, 0, c11, 1) @@ -99,68 +97,129 @@ #define ICH_AP1R2 __AP1Rx(2) #define ICH_AP1R3 __AP1Rx(3) +/* A32-to-A64 mappings used by VGIC save/restore */ + +#define CPUIF_MAP(a32, a64) \ +static inline void write_ ## a64(u32 val) \ +{ \ + write_sysreg(val, a32); \ +} \ +static inline u32 read_ ## a64(void) \ +{ \ + return read_sysreg(a32); \ +} \ + +#define CPUIF_MAP_LO_HI(a32lo, a32hi, a64) \ +static inline void write_ ## a64(u64 val) \ +{ \ + write_sysreg(lower_32_bits(val), a32lo);\ + write_sysreg(upper_32_bits(val), a32hi);\ +} \ +static inline u64 read_ ## a64(void) \ +{ \ + u64 val = read_sysreg(a32lo); \ + \ + val |= (u64)read_sysreg(a32hi) << 32; \ + \ + return val; \ +} + +CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) +CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) +CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) +CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) +CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2) +CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) +CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) +CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) +CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2) +CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2) +CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2) +CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2) +CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2) +CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2) +CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2) +CPUIF_MAP(ICC_SRE, ICC_SRE_EL1) + +CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2) +CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2) +CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2) +CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2) +CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2) +CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2) +CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2) +CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2) +CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2) +CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2) +CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2) +CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2) +CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2) +CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2) +CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2) +CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2) + +#define read_gicreg(r) read_##r() +#define write_gicreg(v, r) write_##r(v) + /* Low-level accessors */ static inline void gic_write_eoir(u32 irq) { - asm volatile("mcr " __stringify(ICC_EOIR1) : : "r" (irq)); + write_sysreg(irq, ICC_EOIR1); isb(); } static inline void gic_write_dir(u32 val) { - asm volatile("mcr " __stringify(ICC_DIR) : : "r" (val)); + write_sysreg(val, ICC_DIR); isb(); } static inline u32 gic_read_iar(void) { - u32 irqstat; + u32 irqstat = read_sysreg(ICC_IAR1); - asm volatile("mrc " __stringify(ICC_IAR1) : "=r" (irqstat)); dsb(sy); + return irqstat; } static inline void gic_write_pmr(u32 val) { - asm volatile("mcr " __stringify(ICC_PMR) : : "r" (val)); + write_sysreg(val, ICC_PMR); } static inline void gic_write_ctlr(u32 val) { - asm volatile("mcr " __stringify(ICC_CTLR) : : "r" (val)); + write_sysreg(val, ICC_CTLR); isb(); } static inline void gic_write_grpen1(u32 val) { - asm volatile("mcr " __stringify(ICC_IGRPEN1) : : "r" (val)); + write_sysreg(val, ICC_IGRPEN1); isb(); } static inline void gic_write_sgi1r(u64 val) { - asm volatile("mcrr " __stringify(ICC_SGI1R) : : "r" (val)); + write_sysreg(val, ICC_SGI1R); } static inline u32 gic_read_sre(void) { - u32 val; - - asm volatile("mrc " __stringify(ICC_SRE) : "=r" (val)); - return val; + return read_sysreg(ICC_SRE); } static inline void gic_write_sre(u32 val) { - asm volatile("mcr " __stringify(ICC_SRE) : : "r" (val)); + write_sysreg(val, ICC_SRE); isb(); } static inline void gic_write_bpr1(u32 val) { - asm volatile("mcr " __stringify(ICC_BPR1) : : "r" (val)); + write_sysreg(val, ICC_BPR1); } /* diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h index c3f1152..dbdbce1 100644 --- a/arch/arm/include/asm/cp15.h +++ b/arch/arm/include/asm/cp15.h @@ -49,6 +49,21 @@ #ifdef CONFIG_CPU_CP15 +#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ + "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 +#define __ACCESS_CP15_64(Op1, CRm) \ + "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64 + +#define __read_sysreg(r, w, c, t) ({ \ + t __val; \ + asm volatile(r " " c : "=r" (__val)); \ + __val; \ +}) +#define read_sysreg(...) __read_sysreg(__VA_ARGS__) + +#define __write_sysreg(v, r, w, c, t) asm volatile(w " " c : : "r" ((t)(v))) +#define write_sysreg(v, ...) __write_sysreg(v, __VA_ARGS__) + extern unsigned long cr_alignment; /* defined in entry-armv.S */ static inline unsigned long get_cr(void) diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h index 754f86f..522b5fe 100644 --- a/arch/arm/include/asm/cputype.h +++ b/arch/arm/include/asm/cputype.h @@ -55,6 +55,7 @@ #define MPIDR_LEVEL_BITS 8 #define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1) +#define MPIDR_LEVEL_SHIFT(level) (MPIDR_LEVEL_BITS * level) #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 58faff5..d7ea6bc 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -21,6 +21,10 @@ #include <asm/virt.h> +#define ARM_EXIT_WITH_ABORT_BIT 31 +#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) +#define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) + #define ARM_EXCEPTION_RESET 0 #define ARM_EXCEPTION_UNDEFINED 1 #define ARM_EXCEPTION_SOFTWARE 2 @@ -68,6 +72,9 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); extern void __kvm_hyp_reset(unsigned long); + +extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern void __vgic_v3_init_lrs(void); #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index ee5328f..9a8a45a 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -40,18 +40,29 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, *vcpu_reg(vcpu, reg_num) = val; } -bool kvm_condition_valid(struct kvm_vcpu *vcpu); -void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr); +bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); +void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); void kvm_inject_undefined(struct kvm_vcpu *vcpu); +void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); +static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) +{ + return kvm_condition_valid32(vcpu); +} + +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + kvm_skip_instr32(vcpu, is_wide_instr); +} + static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { vcpu->arch.hcr = HCR_GUEST_MASK; } -static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu) +static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu) { return vcpu->arch.hcr; } @@ -61,7 +72,7 @@ static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr) vcpu->arch.hcr = hcr; } -static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu) +static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) { return 1; } @@ -71,9 +82,9 @@ static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu) return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc; } -static inline unsigned long *vcpu_cpsr(struct kvm_vcpu *vcpu) +static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu) { - return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr; + return (unsigned long *)&vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr; } static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) @@ -93,11 +104,21 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) return cpsr_mode > USR_MODE;; } -static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu) +static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.hsr; } +static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) +{ + u32 hsr = kvm_vcpu_get_hsr(vcpu); + + if (hsr & HSR_CV) + return (hsr & HSR_COND) >> HSR_COND_SHIFT; + + return -1; +} + static inline unsigned long kvm_vcpu_get_hfar(struct kvm_vcpu *vcpu) { return vcpu->arch.fault.hxfar; diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de338d9..2d19e02 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -39,7 +39,12 @@ #include <kvm/arm_vgic.h> + +#ifdef CONFIG_ARM_GIC_V3 +#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS +#else #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS +#endif #define KVM_REQ_VCPU_EXIT 8 @@ -183,15 +188,15 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 hvc_exit_stat; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 6eaff28..343135e 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -20,28 +20,15 @@ #include <linux/compiler.h> #include <linux/kvm_host.h> +#include <asm/cp15.h> #include <asm/kvm_mmu.h> #include <asm/vfp.h> #define __hyp_text __section(.hyp.text) notrace -#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ - "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 -#define __ACCESS_CP15_64(Op1, CRm) \ - "mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64 #define __ACCESS_VFP(CRn) \ "mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32 -#define __write_sysreg(v, r, w, c, t) asm volatile(w " " c : : "r" ((t)(v))) -#define write_sysreg(v, ...) __write_sysreg(v, __VA_ARGS__) - -#define __read_sysreg(r, w, c, t) ({ \ - t __val; \ - asm volatile(r " " c : "=r" (__val)); \ - __val; \ -}) -#define read_sysreg(...) __read_sysreg(__VA_ARGS__) - #define write_special(v, r) \ asm volatile("msr " __stringify(r) ", %0" : : "r" (v)) #define read_special(r) ({ \ @@ -119,6 +106,9 @@ void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); void __sysreg_save_state(struct kvm_cpu_context *ctxt); void __sysreg_restore_state(struct kvm_cpu_context *ctxt); +void __vgic_v3_save_state(struct kvm_vcpu *vcpu); +void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); + void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp); void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp); static inline bool __vfp_enabled(void) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 3bb803d..74a44727 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -63,37 +63,13 @@ void kvm_clear_hyp_idmap(void); static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) { *pmd = new_pmd; - flush_pmd_entry(pmd); + dsb(ishst); } static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) { *pte = new_pte; - /* - * flush_pmd_entry just takes a void pointer and cleans the necessary - * cache entries, so we can reuse the function for ptes. - */ - flush_pmd_entry(pte); -} - -static inline void kvm_clean_pgd(pgd_t *pgd) -{ - clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t)); -} - -static inline void kvm_clean_pmd(pmd_t *pmd) -{ - clean_dcache_area(pmd, PTRS_PER_PMD * sizeof(pmd_t)); -} - -static inline void kvm_clean_pmd_entry(pmd_t *pmd) -{ - clean_pmd_entry(pmd); -} - -static inline void kvm_clean_pte(pte_t *pte) -{ - clean_pte_table(pte); + dsb(ishst); } static inline pte_t kvm_s2pte_mkwrite(pte_t pte) diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index a2b3eb3..b38c10c 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -84,6 +84,13 @@ struct kvm_regs { #define KVM_VGIC_V2_DIST_SIZE 0x1000 #define KVM_VGIC_V2_CPU_SIZE 0x2000 +/* Supported VGICv3 address types */ +#define KVM_VGIC_V3_ADDR_TYPE_DIST 2 +#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 + +#define KVM_VGIC_V3_DIST_SIZE SZ_64K +#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) + #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index 10d77a6..f19842e 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -21,13 +21,16 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/ obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o +obj-y += $(KVM)/arm/aarch32.o obj-y += $(KVM)/arm/vgic/vgic.o obj-y += $(KVM)/arm/vgic/vgic-init.o obj-y += $(KVM)/arm/vgic/vgic-irqfd.o obj-y += $(KVM)/arm/vgic/vgic-v2.o +obj-y += $(KVM)/arm/vgic/vgic-v3.o obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o +obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o obj-y += $(KVM)/irqchip.o obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c94b90d..03e9273 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -144,6 +144,16 @@ out_fail_alloc: return ret; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { return VM_FAULT_SIGBUS; @@ -1176,6 +1186,10 @@ static int init_common_resources(void) return -ENOMEM; } + /* set size of VMID supported by CPU */ + kvm_vmid_bits = kvm_get_vmid_bits(); + kvm_info("%d-bit VMID\n", kvm_vmid_bits); + return 0; } @@ -1241,10 +1255,6 @@ static void teardown_hyp_mode(void) static int init_vhe_mode(void) { - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_info("VHE mode initialized successfully\n"); return 0; } @@ -1328,10 +1338,6 @@ static int init_hyp_mode(void) } } - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_info("Hyp mode initialized successfully\n"); return 0; diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 1bb2b79..3e5e419 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -228,6 +228,35 @@ bool access_vm_reg(struct kvm_vcpu *vcpu, return true; } +static bool access_gic_sgi(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + u64 reg; + + if (!p->is_write) + return read_from_write_only(vcpu, p); + + reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; + reg |= *vcpu_reg(vcpu, p->Rt1) ; + + vgic_v3_dispatch_sgi(vcpu, reg); + + return true; +} + +static bool access_gic_sre(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; + + return true; +} + /* * We could trap ID_DFR0 and tell the guest we don't support performance * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was @@ -361,10 +390,16 @@ static const struct coproc_reg cp15_regs[] = { { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32, access_vm_reg, reset_unknown, c10_AMAIR1}, + /* ICC_SGI1R */ + { CRm64(12), Op1( 0), is64, access_gic_sgi}, + /* VBAR: swapped by interrupt.S. */ { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32, NULL, reset_val, c12_VBAR, 0x00000000 }, + /* ICC_SRE */ + { CRn(12), CRm(12), Op1( 0), Op2(5), is32, access_gic_sre }, + /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */ { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32, access_vm_reg, reset_val, c13_CID, 0x00000000 }, diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index af93e3f..0064b86 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -161,105 +161,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu) } } -/* - * A conditional instruction is allowed to trap, even though it - * wouldn't be executed. So let's re-implement the hardware, in - * software! - */ -bool kvm_condition_valid(struct kvm_vcpu *vcpu) -{ - unsigned long cpsr, cond, insn; - - /* - * Exception Code 0 can only happen if we set HCR.TGE to 1, to - * catch undefined instructions, and then we won't get past - * the arm_exit_handlers test anyway. - */ - BUG_ON(!kvm_vcpu_trap_get_class(vcpu)); - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - /* Is condition field valid? */ - if ((kvm_vcpu_get_hsr(vcpu) & HSR_CV) >> HSR_CV_SHIFT) - cond = (kvm_vcpu_get_hsr(vcpu) & HSR_COND) >> HSR_COND_SHIFT; - else { - /* This can happen in Thumb mode: examine IT state. */ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - /* Shift makes it look like an ARM-mode instruction */ - insn = cond << 28; - return arm_check_condition(insn, cpsr) != ARM_OPCODE_CONDTEST_FAIL; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & PSR_T_BIT); - - BUG_ON(is_arm && (cpsr & PSR_IT_MASK)); - - if (!(cpsr & PSR_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~PSR_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; - else - *vcpu_pc(vcpu) += 4; - kvm_adjust_itstate(vcpu); -} - - /****************************************************************************** * Inject exceptions into the guest */ @@ -402,3 +303,15 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) { inject_abt(vcpu, true, addr); } + +/** + * kvm_inject_vabt - inject an async abort / SError into the guest + * @vcpu: The VCPU to receive the exception + * + * It is assumed that this code is called from the VCPU thread and that the + * VCPU therefore is not currently executing guest code. + */ +void kvm_inject_vabt(struct kvm_vcpu *vcpu) +{ + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA); +} diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 3f1ef0d..4e40d19 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -28,14 +28,6 @@ typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); -static int handle_svc_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* SVC called from Hyp mode should never get here */ - kvm_debug("SVC called from Hyp mode shouldn't go here\n"); - BUG(); - return -EINVAL; /* Squash warning */ -} - static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; @@ -59,22 +51,6 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } -static int handle_pabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* The hypervisor should never cause aborts */ - kvm_err("Prefetch Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n", - kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu)); - return -EFAULT; -} - -static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* This is either an error in the ws. code or an external abort */ - kvm_err("Data Abort taken from Hyp mode at %#08lx (HSR: %#08x)\n", - kvm_vcpu_get_hfar(vcpu), kvm_vcpu_get_hsr(vcpu)); - return -EFAULT; -} - /** * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests * @vcpu: the vcpu pointer @@ -112,13 +88,10 @@ static exit_handle_fn arm_exit_handlers[] = { [HSR_EC_CP14_64] = kvm_handle_cp14_access, [HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access, [HSR_EC_CP10_ID] = kvm_handle_cp10_id, - [HSR_EC_SVC_HYP] = handle_svc_hyp, [HSR_EC_HVC] = handle_hvc, [HSR_EC_SMC] = handle_smc, [HSR_EC_IABT] = kvm_handle_guest_abort, - [HSR_EC_IABT_HYP] = handle_pabt_hyp, [HSR_EC_DABT] = kvm_handle_guest_abort, - [HSR_EC_DABT_HYP] = handle_dabt_hyp, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) @@ -144,6 +117,25 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, { exit_handle_fn exit_handler; + if (ARM_ABORT_PENDING(exception_index)) { + u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); + + /* + * HVC/SMC already have an adjusted PC, which we need + * to correct in order to return to after having + * injected the abort. + */ + if (hsr_ec == HSR_EC_HVC || hsr_ec == HSR_EC_SMC) { + u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2; + *vcpu_pc(vcpu) -= adj; + } + + kvm_inject_vabt(vcpu); + return 1; + } + + exception_index = ARM_EXCEPTION_CODE(exception_index); + switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; @@ -160,6 +152,9 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, exit_handler = kvm_get_exit_handler(vcpu); return exit_handler(vcpu, run); + case ARM_EXCEPTION_DATA_ABORT: + kvm_inject_vabt(vcpu); + return 1; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile index 8dfa5f7..3023bb5 100644 --- a/arch/arm/kvm/hyp/Makefile +++ b/arch/arm/kvm/hyp/Makefile @@ -5,6 +5,7 @@ KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o diff --git a/arch/arm/kvm/hyp/entry.S b/arch/arm/kvm/hyp/entry.S index 21c2388..60783f3 100644 --- a/arch/arm/kvm/hyp/entry.S +++ b/arch/arm/kvm/hyp/entry.S @@ -18,6 +18,7 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/kvm_arm.h> +#include <asm/kvm_asm.h> .arch_extension virt @@ -63,6 +64,36 @@ ENTRY(__guest_exit) ldr lr, [r0, #4] mov r0, r1 + mrs r1, SPSR + mrs r2, ELR_hyp + mrc p15, 4, r3, c5, c2, 0 @ HSR + + /* + * Force loads and stores to complete before unmasking aborts + * and forcing the delivery of the exception. This gives us a + * single instruction window, which the handler will try to + * match. + */ + dsb sy + cpsie a + + .global abort_guest_exit_start +abort_guest_exit_start: + + isb + + .global abort_guest_exit_end +abort_guest_exit_end: + + /* + * If we took an abort, r0[31] will be set, and cmp will set + * the N bit in PSTATE. + */ + cmp r0, #0 + msrmi SPSR_cxsf, r1 + msrmi ELR_hyp, r2 + mcrmi p15, 4, r3, c5, c2, 0 @ HSR + bx lr ENDPROC(__guest_exit) diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index 7809138..96beb53 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -81,7 +81,6 @@ __kvm_hyp_vector: invalid_vector hyp_undef ARM_EXCEPTION_UNDEFINED invalid_vector hyp_svc ARM_EXCEPTION_SOFTWARE invalid_vector hyp_pabt ARM_EXCEPTION_PREF_ABORT - invalid_vector hyp_dabt ARM_EXCEPTION_DATA_ABORT invalid_vector hyp_fiq ARM_EXCEPTION_FIQ ENTRY(__hyp_do_panic) @@ -164,6 +163,21 @@ hyp_irq: load_vcpu r0 @ Load VCPU pointer to r0 b __guest_exit +hyp_dabt: + push {r0, r1} + mrs r0, ELR_hyp + ldr r1, =abort_guest_exit_start +THUMB( add r1, r1, #1) + cmp r0, r1 + ldrne r1, =abort_guest_exit_end +THUMB( addne r1, r1, #1) + cmpne r0, r1 + pop {r0, r1} + bne __hyp_panic + + orr r0, r0, #(1 << ARM_EXIT_WITH_ABORT_BIT) + eret + .ltorg .popsection diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c index b13caa9..92678b7 100644 --- a/arch/arm/kvm/hyp/switch.c +++ b/arch/arm/kvm/hyp/switch.c @@ -14,6 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/jump_label.h> #include <asm/kvm_asm.h> #include <asm/kvm_hyp.h> @@ -54,6 +55,15 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) { u32 val; + /* + * If we pended a virtual abort, preserve it until it gets + * cleared. See B1.9.9 (Virtual Abort exception) for details, + * but the crucial bit is the zeroing of HCR.VA in the + * pseudocode. + */ + if (vcpu->arch.hcr & HCR_VA) + vcpu->arch.hcr = read_sysreg(HCR); + write_sysreg(0, HCR); write_sysreg(0, HSTR); val = read_sysreg(HDCR); @@ -74,14 +84,21 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) write_sysreg(read_sysreg(MIDR), VPIDR); } + static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) { - __vgic_v2_save_state(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_save_state(vcpu); + else + __vgic_v2_save_state(vcpu); } static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) { - __vgic_v2_restore_state(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_restore_state(vcpu); + else + __vgic_v2_restore_state(vcpu); } static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) @@ -134,7 +151,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -static int __hyp_text __guest_run(struct kvm_vcpu *vcpu) +int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; @@ -191,8 +208,6 @@ again: return exit_code; } -__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu); - static const char * const __hyp_panic_string[] = { [ARM_EXCEPTION_RESET] = "\nHYP panic: RST PC:%08x CPSR:%08x", [ARM_EXCEPTION_UNDEFINED] = "\nHYP panic: UNDEF PC:%08x CPSR:%08x", diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c index a263600..7296528 100644 --- a/arch/arm/kvm/hyp/tlb.c +++ b/arch/arm/kvm/hyp/tlb.c @@ -34,7 +34,7 @@ * As v7 does not support flushing per IPA, just nuke the whole TLB * instead, ignoring the ipa value. */ -static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) +void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) { dsb(ishst); @@ -50,21 +50,14 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) write_sysreg(0, VTTBR); } -__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm); - -static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) +void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { - __tlb_flush_vmid(kvm); + __kvm_tlb_flush_vmid(kvm); } -__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, - phys_addr_t ipa); - -static void __hyp_text __tlb_flush_vm_context(void) +void __hyp_text __kvm_flush_vm_context(void) { write_sysreg(0, TLBIALLNSNHIS); write_sysreg(0, ICIALLUIS); dsb(ish); } - -__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void); diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 10f80a6..b6e715f 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -126,12 +126,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) int access_size; bool sign_extend; - if (kvm_vcpu_dabt_isextabt(vcpu)) { - /* cache operation on I/O addr, tell guest unsupported */ - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - if (kvm_vcpu_dabt_iss1tw(vcpu)) { /* page table accesses IO mem: tell guest to fix its TTBR */ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index e9a5c0e..a5265ed 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -744,7 +744,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!pgd) return -ENOMEM; - kvm_clean_pgd(pgd); kvm->arch.pgd = pgd; return 0; } @@ -936,7 +935,6 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ pte = mmu_memory_cache_alloc(cache); - kvm_clean_pte(pte); pmd_populate_kernel(NULL, pmd, pte); get_page(virt_to_page(pmd)); } @@ -1434,6 +1432,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) int ret, idx; is_iabt = kvm_vcpu_trap_is_iabt(vcpu); + if (unlikely(!is_iabt && kvm_vcpu_dabt_isextabt(vcpu))) { + kvm_inject_vabt(vcpu); + return 1; + } + fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu), diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index fc2a0cb..f8ae6d6 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -80,6 +80,19 @@ #include <linux/stringify.h> #include <asm/barrier.h> +#define read_gicreg(r) \ + ({ \ + u64 reg; \ + asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \ + reg; \ + }) + +#define write_gicreg(v,r) \ + do { \ + u64 __val = (v); \ + asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\ + } while (0) + /* * Low-level accessors * diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 4b5c977..2a2752b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -50,7 +50,7 @@ #define HCR_BSU (3 << 10) #define HCR_BSU_IS (UL(1) << 10) #define HCR_FB (UL(1) << 9) -#define HCR_VA (UL(1) << 8) +#define HCR_VSE (UL(1) << 8) #define HCR_VI (UL(1) << 7) #define HCR_VF (UL(1) << 6) #define HCR_AMO (UL(1) << 5) @@ -80,7 +80,7 @@ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW) -#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) +#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7561f63..18f7465 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -20,10 +20,15 @@ #include <asm/virt.h> +#define ARM_EXIT_WITH_SERROR_BIT 31 +#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) +#define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) + #define ARM_EXCEPTION_IRQ 0 -#define ARM_EXCEPTION_TRAP 1 +#define ARM_EXCEPTION_EL1_SERROR 1 +#define ARM_EXCEPTION_TRAP 2 /* The hyp-stub will return this for any kvm_call_hyp() call */ -#define ARM_EXCEPTION_HYP_GONE 2 +#define ARM_EXCEPTION_HYP_GONE 3 #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 4cdeae3..fd9d5fd 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -38,6 +38,7 @@ bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); void kvm_inject_undefined(struct kvm_vcpu *vcpu); +void kvm_inject_vabt(struct kvm_vcpu *vcpu); void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); @@ -147,6 +148,16 @@ static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu) return vcpu->arch.fault.esr_el2; } +static inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) +{ + u32 esr = kvm_vcpu_get_hsr(vcpu); + + if (esr & ESR_ELx_CV) + return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; + + return -1; +} + static inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu) { return vcpu->arch.fault.far_el2; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3eda975..bd94e67 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -290,15 +290,15 @@ struct kvm_vcpu_arch { #endif struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 hvc_exit_stat; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index cff5105..b18e852 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -123,6 +123,7 @@ typeof(orig) * __hyp_text fname(void) \ void __vgic_v2_save_state(struct kvm_vcpu *vcpu); void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); +int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index dff1098..a79b969 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -162,12 +162,6 @@ void kvm_clear_hyp_idmap(void); #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) #define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) -static inline void kvm_clean_pgd(pgd_t *pgd) {} -static inline void kvm_clean_pmd(pmd_t *pmd) {} -static inline void kvm_clean_pmd_entry(pmd_t *pmd) {} -static inline void kvm_clean_pte(pte_t *pte) {} -static inline void kvm_clean_pte_entry(pte_t *pte) {} - static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= PTE_S2_RDWR; diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 9c9edc9..6eaf12c 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -16,7 +16,7 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM_ARM_VGIC_V3 +config KVM_ARM_VGIC_V3_ITS bool config KVM @@ -34,7 +34,7 @@ config KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_VGIC_V3 + select KVM_ARM_VGIC_V3_ITS select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 695eb3c..d50a82a 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -16,9 +16,10 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/e kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o -kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o +kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o diff --git a/arch/arm64/kvm/emulate.c b/arch/arm64/kvm/emulate.c deleted file mode 100644 index f87d8fb..0000000 --- a/arch/arm64/kvm/emulate.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * (not much of an) Emulation layer for 32bit guests. - * - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * based on arch/arm/kvm/emulate.c - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/kvm_host.h> -#include <asm/esr.h> -#include <asm/kvm_emulate.h> - -/* - * stolen from arch/arm/kernel/opcodes.c - * - * condition code lookup table - * index into the table is test code: EQ, NE, ... LT, GT, AL, NV - * - * bit position in short is condition code: NZCV - */ -static const unsigned short cc_map[16] = { - 0xF0F0, /* EQ == Z set */ - 0x0F0F, /* NE */ - 0xCCCC, /* CS == C set */ - 0x3333, /* CC */ - 0xFF00, /* MI == N set */ - 0x00FF, /* PL */ - 0xAAAA, /* VS == V set */ - 0x5555, /* VC */ - 0x0C0C, /* HI == C set && Z clear */ - 0xF3F3, /* LS == C clear || Z set */ - 0xAA55, /* GE == (N==V) */ - 0x55AA, /* LT == (N!=V) */ - 0x0A05, /* GT == (!Z && (N==V)) */ - 0xF5FA, /* LE == (Z || (N!=V)) */ - 0xFFFF, /* AL always */ - 0 /* NV */ -}; - -static int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) -{ - u32 esr = kvm_vcpu_get_hsr(vcpu); - - if (esr & ESR_ELx_CV) - return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; - - return -1; -} - -/* - * Check if a trapped instruction should have been executed or not. - */ -bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) -{ - unsigned long cpsr; - u32 cpsr_cond; - int cond; - - /* Top two bits non-zero? Unconditional. */ - if (kvm_vcpu_get_hsr(vcpu) >> 30) - return true; - - /* Is condition field valid? */ - cond = kvm_vcpu_get_condition(vcpu); - if (cond == 0xE) - return true; - - cpsr = *vcpu_cpsr(vcpu); - - if (cond < 0) { - /* This can happen in Thumb mode: examine IT state. */ - unsigned long it; - - it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3); - - /* it == 0 => unconditional. */ - if (it == 0) - return true; - - /* The cond for this insn works out as the top 4 bits. */ - cond = (it >> 4); - } - - cpsr_cond = cpsr >> 28; - - if (!((cc_map[cond] >> cpsr_cond) & 1)) - return false; - - return true; -} - -/** - * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block - * @vcpu: The VCPU pointer - * - * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have - * to do this little bit of work manually. The fields map like this: - * - * IT[7:0] -> CPSR[26:25],CPSR[15:10] - */ -static void kvm_adjust_itstate(struct kvm_vcpu *vcpu) -{ - unsigned long itbits, cond; - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_arm = !(cpsr & COMPAT_PSR_T_BIT); - - BUG_ON(is_arm && (cpsr & COMPAT_PSR_IT_MASK)); - - if (!(cpsr & COMPAT_PSR_IT_MASK)) - return; - - cond = (cpsr & 0xe000) >> 13; - itbits = (cpsr & 0x1c00) >> (10 - 2); - itbits |= (cpsr & (0x3 << 25)) >> 25; - - /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */ - if ((itbits & 0x7) == 0) - itbits = cond = 0; - else - itbits = (itbits << 1) & 0x1f; - - cpsr &= ~COMPAT_PSR_IT_MASK; - cpsr |= cond << 13; - cpsr |= (itbits & 0x1c) << (10 - 2); - cpsr |= (itbits & 0x3) << 25; - *vcpu_cpsr(vcpu) = cpsr; -} - -/** - * kvm_skip_instr - skip a trapped instruction and proceed to the next - * @vcpu: The vcpu pointer - */ -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - bool is_thumb; - - is_thumb = !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_T_BIT); - if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; - else - *vcpu_pc(vcpu) += 4; - kvm_adjust_itstate(vcpu); -} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fa96fe2..a204adf 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -170,9 +170,32 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, { exit_handle_fn exit_handler; + if (ARM_SERROR_PENDING(exception_index)) { + u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu)); + + /* + * HVC/SMC already have an adjusted PC, which we need + * to correct in order to return to after having + * injected the SError. + */ + if (hsr_ec == ESR_ELx_EC_HVC32 || hsr_ec == ESR_ELx_EC_HVC64 || + hsr_ec == ESR_ELx_EC_SMC32 || hsr_ec == ESR_ELx_EC_SMC64) { + u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2; + *vcpu_pc(vcpu) -= adj; + } + + kvm_inject_vabt(vcpu); + return 1; + } + + exception_index = ARM_EXCEPTION_CODE(exception_index); + switch (exception_index) { case ARM_EXCEPTION_IRQ: return 1; + case ARM_EXCEPTION_EL1_SERROR: + kvm_inject_vabt(vcpu); + return 1; case ARM_EXCEPTION_TRAP: /* * See ARM ARM B1.14.1: "Hyp traps on instructions diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 0c85feb..aaf42ae 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -5,9 +5,9 @@ KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o +obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o obj-$(CONFIG_KVM_ARM_HOST) += entry.o diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c index 33342a7..4ba5c90 100644 --- a/arch/arm64/kvm/hyp/debug-sr.c +++ b/arch/arm64/kvm/hyp/debug-sr.c @@ -131,9 +131,7 @@ void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu) vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY; } -static u32 __hyp_text __debug_read_mdcr_el2(void) +u32 __hyp_text __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } - -__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void); diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index ce9e5e5..12ee62d 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -55,79 +55,111 @@ */ ENTRY(__guest_enter) // x0: vcpu - // x1: host/guest context - // x2-x18: clobbered by macros + // x1: host context + // x2-x17: clobbered by macros + // x18: guest context // Store the host regs save_callee_saved_regs x1 - // Preserve vcpu & host_ctxt for use at exit time - stp x0, x1, [sp, #-16]! + // Store the host_ctxt for use at exit time + str x1, [sp, #-16]! - add x1, x0, #VCPU_CONTEXT + add x18, x0, #VCPU_CONTEXT - // Prepare x0-x1 for later restore by pushing them onto the stack - ldp x2, x3, [x1, #CPU_XREG_OFFSET(0)] - stp x2, x3, [sp, #-16]! + // Restore guest regs x0-x17 + ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x18, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x18, #CPU_XREG_OFFSET(6)] + ldp x8, x9, [x18, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)] - // x2-x18 - ldp x2, x3, [x1, #CPU_XREG_OFFSET(2)] - ldp x4, x5, [x1, #CPU_XREG_OFFSET(4)] - ldp x6, x7, [x1, #CPU_XREG_OFFSET(6)] - ldp x8, x9, [x1, #CPU_XREG_OFFSET(8)] - ldp x10, x11, [x1, #CPU_XREG_OFFSET(10)] - ldp x12, x13, [x1, #CPU_XREG_OFFSET(12)] - ldp x14, x15, [x1, #CPU_XREG_OFFSET(14)] - ldp x16, x17, [x1, #CPU_XREG_OFFSET(16)] - ldr x18, [x1, #CPU_XREG_OFFSET(18)] - - // x19-x29, lr - restore_callee_saved_regs x1 - - // Last bits of the 64bit state - ldp x0, x1, [sp], #16 + // Restore guest regs x19-x29, lr + restore_callee_saved_regs x18 + + // Restore guest reg x18 + ldr x18, [x18, #CPU_XREG_OFFSET(18)] // Do not touch any register after this! eret ENDPROC(__guest_enter) ENTRY(__guest_exit) - // x0: vcpu - // x1: return code - // x2-x3: free - // x4-x29,lr: vcpu regs - // vcpu x0-x3 on the stack + // x0: return code + // x1: vcpu + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack - add x2, x0, #VCPU_CONTEXT + add x1, x1, #VCPU_CONTEXT - stp x4, x5, [x2, #CPU_XREG_OFFSET(4)] - stp x6, x7, [x2, #CPU_XREG_OFFSET(6)] - stp x8, x9, [x2, #CPU_XREG_OFFSET(8)] - stp x10, x11, [x2, #CPU_XREG_OFFSET(10)] - stp x12, x13, [x2, #CPU_XREG_OFFSET(12)] - stp x14, x15, [x2, #CPU_XREG_OFFSET(14)] - stp x16, x17, [x2, #CPU_XREG_OFFSET(16)] - str x18, [x2, #CPU_XREG_OFFSET(18)] + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) - ldp x6, x7, [sp], #16 // x2, x3 - ldp x4, x5, [sp], #16 // x0, x1 + // Store the guest regs x2 and x3 + stp x2, x3, [x1, #CPU_XREG_OFFSET(2)] - stp x4, x5, [x2, #CPU_XREG_OFFSET(0)] - stp x6, x7, [x2, #CPU_XREG_OFFSET(2)] + // Retrieve the guest regs x0-x1 from the stack + ldp x2, x3, [sp], #16 // x0, x1 + + // Store the guest regs x0-x1 and x4-x18 + stp x2, x3, [x1, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x1, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x1, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x1, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x1, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x1, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x1, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x1, #CPU_XREG_OFFSET(16)] + str x18, [x1, #CPU_XREG_OFFSET(18)] + + // Store the guest regs x19-x29, lr + save_callee_saved_regs x1 - save_callee_saved_regs x2 + // Restore the host_ctxt from the stack + ldr x2, [sp], #16 - // Restore vcpu & host_ctxt from the stack - // (preserving return code in x1) - ldp x0, x2, [sp], #16 // Now restore the host regs restore_callee_saved_regs x2 - mov x0, x1 - ret + // If we have a pending asynchronous abort, now is the + // time to find out. From your VAXorcist book, page 666: + // "Threaten me not, oh Evil one! For I speak with + // the power of DEC, and I command thee to show thyself!" + mrs x2, elr_el2 + mrs x3, esr_el2 + mrs x4, spsr_el2 + mov x5, x0 + + dsb sy // Synchronize against in-flight ld/st + msr daifclr, #4 // Unmask aborts + + // This is our single instruction exception window. A pending + // SError is guaranteed to occur at the earliest when we unmask + // it, and at the latest just after the ISB. + .global abort_guest_exit_start +abort_guest_exit_start: + + isb + + .global abort_guest_exit_end +abort_guest_exit_end: + + // If the exception took place, restore the EL1 exception + // context so that we can report some information. + // Merge the exception code with the SError pending bit. + tbz x0, #ARM_EXIT_WITH_SERROR_BIT, 1f + msr elr_el2, x2 + msr esr_el2, x3 + msr spsr_el2, x4 + orr x0, x0, x5 +1: ret ENDPROC(__guest_exit) ENTRY(__fpsimd_guest_restore) + stp x2, x3, [sp, #-16]! stp x4, lr, [sp, #-16]! alternative_if_not ARM64_HAS_VIRT_HOST_EXTN diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index f6d9694..4e92399 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -27,16 +27,6 @@ .text .pushsection .hyp.text, "ax" -.macro save_x0_to_x3 - stp x0, x1, [sp, #-16]! - stp x2, x3, [sp, #-16]! -.endm - -.macro restore_x0_to_x3 - ldp x2, x3, [sp], #16 - ldp x0, x1, [sp], #16 -.endm - .macro do_el2_call /* * Shuffle the parameters before calling the function @@ -79,23 +69,23 @@ ENTRY(__kvm_hyp_teardown) ENDPROC(__kvm_hyp_teardown) el1_sync: // Guest trapped into EL2 - save_x0_to_x3 + stp x0, x1, [sp, #-16]! alternative_if_not ARM64_HAS_VIRT_HOST_EXTN mrs x1, esr_el2 alternative_else mrs x1, esr_el1 alternative_endif - lsr x2, x1, #ESR_ELx_EC_SHIFT + lsr x0, x1, #ESR_ELx_EC_SHIFT - cmp x2, #ESR_ELx_EC_HVC64 + cmp x0, #ESR_ELx_EC_HVC64 b.ne el1_trap - mrs x3, vttbr_el2 // If vttbr is valid, the 64bit guest - cbnz x3, el1_trap // called HVC + mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest + cbnz x1, el1_trap // called HVC /* Here, we're pretty sure the host called HVC. */ - restore_x0_to_x3 + ldp x0, x1, [sp], #16 cmp x0, #HVC_GET_VECTORS b.ne 1f @@ -113,24 +103,51 @@ alternative_endif el1_trap: /* - * x1: ESR - * x2: ESR_EC + * x0: ESR_EC */ /* Guest accessed VFP/SIMD registers, save host, restore Guest */ - cmp x2, #ESR_ELx_EC_FP_ASIMD + cmp x0, #ESR_ELx_EC_FP_ASIMD b.eq __fpsimd_guest_restore - mrs x0, tpidr_el2 - mov x1, #ARM_EXCEPTION_TRAP + mrs x1, tpidr_el2 + mov x0, #ARM_EXCEPTION_TRAP b __guest_exit el1_irq: - save_x0_to_x3 - mrs x0, tpidr_el2 - mov x1, #ARM_EXCEPTION_IRQ + stp x0, x1, [sp, #-16]! + mrs x1, tpidr_el2 + mov x0, #ARM_EXCEPTION_IRQ + b __guest_exit + +el1_error: + stp x0, x1, [sp, #-16]! + mrs x1, tpidr_el2 + mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit +el2_error: + /* + * Only two possibilities: + * 1) Either we come from the exit path, having just unmasked + * PSTATE.A: change the return code to an EL2 fault, and + * carry on, as we're already in a sane state to handle it. + * 2) Or we come from anywhere else, and that's a bug: we panic. + * + * For (1), x0 contains the original return code and x1 doesn't + * contain anything meaningful at that stage. We can reuse them + * as temp registers. + * For (2), who cares? + */ + mrs x0, elr_el2 + adr x1, abort_guest_exit_start + cmp x0, x1 + adr x1, abort_guest_exit_end + ccmp x0, x1, #4, ne + b.ne __hyp_panic + mov x0, #(1 << ARM_EXIT_WITH_SERROR_BIT) + eret + ENTRY(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) @@ -155,11 +172,9 @@ ENDPROC(\label) invalid_vector el2h_sync_invalid invalid_vector el2h_irq_invalid invalid_vector el2h_fiq_invalid - invalid_vector el2h_error_invalid invalid_vector el1_sync_invalid invalid_vector el1_irq_invalid invalid_vector el1_fiq_invalid - invalid_vector el1_error_invalid .ltorg @@ -174,15 +189,15 @@ ENTRY(__kvm_hyp_vector) ventry el2h_sync_invalid // Synchronous EL2h ventry el2h_irq_invalid // IRQ EL2h ventry el2h_fiq_invalid // FIQ EL2h - ventry el2h_error_invalid // Error EL2h + ventry el2_error // Error EL2h ventry el1_sync // Synchronous 64-bit EL1 ventry el1_irq // IRQ 64-bit EL1 ventry el1_fiq_invalid // FIQ 64-bit EL1 - ventry el1_error_invalid // Error 64-bit EL1 + ventry el1_error // Error 64-bit EL1 ventry el1_sync // Synchronous 32-bit EL1 ventry el1_irq // IRQ 32-bit EL1 ventry el1_fiq_invalid // FIQ 32-bit EL1 - ventry el1_error_invalid // Error 32-bit EL1 + ventry el1_error // Error 32-bit EL1 ENDPROC(__kvm_hyp_vector) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 5a84b45..83037cd 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -16,7 +16,10 @@ */ #include <linux/types.h> +#include <linux/jump_label.h> + #include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> static bool __hyp_text __fpsimd_enabled_nvhe(void) @@ -109,6 +112,15 @@ static hyp_alternate_select(__deactivate_traps_arch, static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) { + /* + * If we pended a virtual abort, preserve it until it gets + * cleared. See D1.14.3 (Virtual Interrupts) for details, but + * the crucial bit is "On taking a vSError interrupt, + * HCR_EL2.VSE is cleared to 0." + */ + if (vcpu->arch.hcr_el2 & HCR_VSE) + vcpu->arch.hcr_el2 = read_sysreg(hcr_el2); + __deactivate_traps_arch()(); write_sysreg(0, hstr_el2); write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2); @@ -126,17 +138,13 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) write_sysreg(0, vttbr_el2); } -static hyp_alternate_select(__vgic_call_save_state, - __vgic_v2_save_state, __vgic_v3_save_state, - ARM64_HAS_SYSREG_GIC_CPUIF); - -static hyp_alternate_select(__vgic_call_restore_state, - __vgic_v2_restore_state, __vgic_v3_restore_state, - ARM64_HAS_SYSREG_GIC_CPUIF); - static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) { - __vgic_call_save_state()(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_save_state(vcpu); + else + __vgic_v2_save_state(vcpu); + write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2); } @@ -149,7 +157,10 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) val |= vcpu->arch.irq_lines; write_sysreg(val, hcr_el2); - __vgic_call_restore_state()(vcpu); + if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_restore_state(vcpu); + else + __vgic_v2_restore_state(vcpu); } static bool __hyp_text __true_value(void) @@ -232,7 +243,22 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -static int __hyp_text __guest_run(struct kvm_vcpu *vcpu) +static void __hyp_text __skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(elr); + + if (vcpu_mode_is_32bit(vcpu)) { + vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); + kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); + } else { + *vcpu_pc(vcpu) += 4; + } + + write_sysreg_el2(*vcpu_pc(vcpu), elr); +} + +int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; @@ -267,9 +293,43 @@ again: exit_code = __guest_enter(vcpu, host_ctxt); /* And we're baaack! */ + /* + * We're using the raw exception code in order to only process + * the trap if no SError is pending. We will come back to the + * same PC once the SError has been injected, and replay the + * trapping instruction. + */ if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu)) goto again; + if (static_branch_unlikely(&vgic_v2_cpuif_trap) && + exit_code == ARM_EXCEPTION_TRAP) { + bool valid; + + valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && + kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_dabt_isextabt(vcpu) && + !kvm_vcpu_dabt_iss1tw(vcpu); + + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) { + __skip_instr(vcpu); + goto again; + } + + if (ret == -1) { + /* Promote an illegal access to an SError */ + __skip_instr(vcpu); + exit_code = ARM_EXCEPTION_EL1_SERROR; + } + + /* 0 falls through to be handler out of EL2 */ + } + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); @@ -293,8 +353,6 @@ again: return exit_code; } -__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu); - static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par) diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index be8177c..9cc0ea7 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -17,7 +17,7 @@ #include <asm/kvm_hyp.h> -static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) +void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { dsb(ishst); @@ -48,10 +48,7 @@ static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) write_sysreg(0, vttbr_el2); } -__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, - phys_addr_t ipa); - -static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) +void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) { dsb(ishst); @@ -67,14 +64,10 @@ static void __hyp_text __tlb_flush_vmid(struct kvm *kvm) write_sysreg(0, vttbr_el2); } -__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm); - -static void __hyp_text __tlb_flush_vm_context(void) +void __hyp_text __kvm_flush_vm_context(void) { dsb(ishst); asm volatile("tlbi alle1is \n" "ic ialluis ": : ); dsb(ish); } - -__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void); diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c deleted file mode 100644 index 5f8f80b..0000000 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Copyright (C) 2012-2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/compiler.h> -#include <linux/irqchip/arm-gic-v3.h> -#include <linux/kvm_host.h> - -#include <asm/kvm_hyp.h> - -#define vtr_to_max_lr_idx(v) ((v) & 0xf) -#define vtr_to_nr_pri_bits(v) (((u32)(v) >> 29) + 1) - -#define read_gicreg(r) \ - ({ \ - u64 reg; \ - asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \ - reg; \ - }) - -#define write_gicreg(v,r) \ - do { \ - u64 __val = (v); \ - asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\ - } while (0) - -static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) -{ - switch (lr & 0xf) { - case 0: - return read_gicreg(ICH_LR0_EL2); - case 1: - return read_gicreg(ICH_LR1_EL2); - case 2: - return read_gicreg(ICH_LR2_EL2); - case 3: - return read_gicreg(ICH_LR3_EL2); - case 4: - return read_gicreg(ICH_LR4_EL2); - case 5: - return read_gicreg(ICH_LR5_EL2); - case 6: - return read_gicreg(ICH_LR6_EL2); - case 7: - return read_gicreg(ICH_LR7_EL2); - case 8: - return read_gicreg(ICH_LR8_EL2); - case 9: - return read_gicreg(ICH_LR9_EL2); - case 10: - return read_gicreg(ICH_LR10_EL2); - case 11: - return read_gicreg(ICH_LR11_EL2); - case 12: - return read_gicreg(ICH_LR12_EL2); - case 13: - return read_gicreg(ICH_LR13_EL2); - case 14: - return read_gicreg(ICH_LR14_EL2); - case 15: - return read_gicreg(ICH_LR15_EL2); - } - - unreachable(); -} - -static void __hyp_text __gic_v3_set_lr(u64 val, int lr) -{ - switch (lr & 0xf) { - case 0: - write_gicreg(val, ICH_LR0_EL2); - break; - case 1: - write_gicreg(val, ICH_LR1_EL2); - break; - case 2: - write_gicreg(val, ICH_LR2_EL2); - break; - case 3: - write_gicreg(val, ICH_LR3_EL2); - break; - case 4: - write_gicreg(val, ICH_LR4_EL2); - break; - case 5: - write_gicreg(val, ICH_LR5_EL2); - break; - case 6: - write_gicreg(val, ICH_LR6_EL2); - break; - case 7: - write_gicreg(val, ICH_LR7_EL2); - break; - case 8: - write_gicreg(val, ICH_LR8_EL2); - break; - case 9: - write_gicreg(val, ICH_LR9_EL2); - break; - case 10: - write_gicreg(val, ICH_LR10_EL2); - break; - case 11: - write_gicreg(val, ICH_LR11_EL2); - break; - case 12: - write_gicreg(val, ICH_LR12_EL2); - break; - case 13: - write_gicreg(val, ICH_LR13_EL2); - break; - case 14: - write_gicreg(val, ICH_LR14_EL2); - break; - case 15: - write_gicreg(val, ICH_LR15_EL2); - break; - } -} - -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && - (cpu_if->vgic_lr[i] & ICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = read_gicreg(ICH_MISR_EL2); - - if (cpu_if->vgic_misr & ICH_MISR_EOI) - cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2); - else - cpu_if->vgic_eisr = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - } -} - -void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - - /* - * Make sure stores to the GIC via the memory mapped interface - * are now visible to the system register interface. - */ - if (!cpu_if->vgic_sre) - dsb(st); - - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - - if (vcpu->arch.vgic_cpu.live_lrs) { - int i; - u32 max_lr_idx, nr_pri_bits; - - cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); - - write_gicreg(0, ICH_HCR_EL2); - val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); - nr_pri_bits = vtr_to_nr_pri_bits(val); - - save_maint_int_state(vcpu, max_lr_idx + 1); - - for (i = 0; i <= max_lr_idx; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - if (cpu_if->vgic_elrsr & (1 << i)) - cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; - else - cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); - - __gic_v3_set_lr(0, i); - } - - switch (nr_pri_bits) { - case 7: - cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2); - cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2); - case 6: - cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2); - default: - cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2); - } - - switch (nr_pri_bits) { - case 7: - cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2); - cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2); - case 6: - cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2); - default: - cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); - } - - vcpu->arch.vgic_cpu.live_lrs = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - cpu_if->vgic_elrsr = 0xffff; - cpu_if->vgic_ap0r[0] = 0; - cpu_if->vgic_ap0r[1] = 0; - cpu_if->vgic_ap0r[2] = 0; - cpu_if->vgic_ap0r[3] = 0; - cpu_if->vgic_ap1r[0] = 0; - cpu_if->vgic_ap1r[1] = 0; - cpu_if->vgic_ap1r[2] = 0; - cpu_if->vgic_ap1r[3] = 0; - } - - val = read_gicreg(ICC_SRE_EL2); - write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); - - if (!cpu_if->vgic_sre) { - /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ - isb(); - write_gicreg(1, ICC_SRE_EL1); - } -} - -void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - u64 val; - u32 max_lr_idx, nr_pri_bits; - u16 live_lrs = 0; - int i; - - /* - * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a - * Group0 interrupt (as generated in GICv2 mode) to be - * delivered as a FIQ to the guest, with potentially fatal - * consequences. So we must make sure that ICC_SRE_EL1 has - * been actually programmed with the value we want before - * starting to mess with the rest of the GIC. - */ - if (!cpu_if->vgic_sre) { - write_gicreg(0, ICC_SRE_EL1); - isb(); - } - - val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); - nr_pri_bits = vtr_to_nr_pri_bits(val); - - for (i = 0; i <= max_lr_idx; i++) { - if (cpu_if->vgic_lr[i] & ICH_LR_STATE) - live_lrs |= (1 << i); - } - - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - - if (live_lrs) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - - switch (nr_pri_bits) { - case 7: - write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2); - write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2); - case 6: - write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2); - default: - write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2); - } - - switch (nr_pri_bits) { - case 7: - write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2); - write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2); - case 6: - write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2); - default: - write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); - } - - for (i = 0; i <= max_lr_idx; i++) { - if (!(live_lrs & (1 << i))) - continue; - - __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } - } - - /* - * Ensures that the above will have reached the - * (re)distributors. This ensure the guest will read the - * correct values from the memory-mapped interface. - */ - if (!cpu_if->vgic_sre) { - isb(); - dsb(sy); - } - vcpu->arch.vgic_cpu.live_lrs = live_lrs; - - /* - * Prevent the guest from touching the GIC system registers if - * SRE isn't enabled for GICv3 emulation. - */ - write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, - ICC_SRE_EL2); -} - -void __hyp_text __vgic_v3_init_lrs(void) -{ - int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)); - int i; - - for (i = 0; i <= max_lr_idx; i++) - __gic_v3_set_lr(0, i); -} - -static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void) -{ - return read_gicreg(ICH_VTR_EL2); -} - -__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 898c0e6..da6a8cf 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -231,3 +231,15 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu) else inject_undef64(vcpu); } + +/** + * kvm_inject_vabt - inject an async abort / SError into the guest + * @vcpu: The VCPU to receive the exception + * + * It is assumed that this code is called from the VCPU thread and that the + * VCPU therefore is not currently executing guest code. + */ +void kvm_inject_vabt(struct kvm_vcpu *vcpu) +{ + vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE); +} diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b54bcad..07f58cf 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -107,35 +107,49 @@ #define KVM_INVALID_INST 0xdeadbeef #define KVM_INVALID_ADDR 0xdeadbeef +/* + * EVA has overlapping user & kernel address spaces, so user VAs may be > + * PAGE_OFFSET. For this reason we can't use the default KVM_HVA_ERR_BAD of + * PAGE_OFFSET. + */ + +#define KVM_HVA_ERR_BAD (-1UL) +#define KVM_HVA_ERR_RO_BAD (-2UL) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return IS_ERR_VALUE(addr); +} + extern atomic_t kvm_mips_instance; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 wait_exits; - u32 cache_exits; - u32 signal_exits; - u32 int_exits; - u32 cop_unusable_exits; - u32 tlbmod_exits; - u32 tlbmiss_ld_exits; - u32 tlbmiss_st_exits; - u32 addrerr_st_exits; - u32 addrerr_ld_exits; - u32 syscall_exits; - u32 resvd_inst_exits; - u32 break_inst_exits; - u32 trap_inst_exits; - u32 msa_fpe_exits; - u32 fpe_exits; - u32 msa_disabled_exits; - u32 flush_dcache_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; + u64 wait_exits; + u64 cache_exits; + u64 signal_exits; + u64 int_exits; + u64 cop_unusable_exits; + u64 tlbmod_exits; + u64 tlbmiss_ld_exits; + u64 tlbmiss_st_exits; + u64 addrerr_st_exits; + u64 addrerr_ld_exits; + u64 syscall_exits; + u64 resvd_inst_exits; + u64 break_inst_exits; + u64 trap_inst_exits; + u64 msa_fpe_exits; + u64 fpe_exits; + u64 msa_disabled_exits; + u64 flush_dcache_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; }; struct kvm_arch_memory_slot { @@ -314,6 +328,9 @@ struct kvm_vcpu_arch { u32 guest_kernel_asid[NR_CPUS]; struct mm_struct guest_kernel_mm, guest_user_mm; + /* Guest ASID of last user mode execution */ + unsigned int last_user_gasid; + int last_sched_cpu; /* WAIT executed */ diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index e788515..4db4c03 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -846,6 +846,47 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) return EMULATE_FAIL; } +/** + * kvm_mips_invalidate_guest_tlb() - Indicates a change in guest MMU map. + * @vcpu: VCPU with changed mappings. + * @tlb: TLB entry being removed. + * + * This is called to indicate a single change in guest MMU mappings, so that we + * can arrange TLB flushes on this and other CPUs. + */ +static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, + struct kvm_mips_tlb *tlb) +{ + int cpu, i; + bool user; + + /* No need to flush for entries which are already invalid */ + if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V)) + return; + /* User address space doesn't need flushing for KSeg2/3 changes */ + user = tlb->tlb_hi < KVM_GUEST_KSEG0; + + preempt_disable(); + + /* + * Probe the shadow host TLB for the entry being overwritten, if one + * matches, invalidate it + */ + kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + + /* Invalidate the whole ASID on other CPUs */ + cpu = smp_processor_id(); + for_each_possible_cpu(i) { + if (i == cpu) + continue; + if (user) + vcpu->arch.guest_user_asid[i] = 0; + vcpu->arch.guest_kernel_asid[i] = 0; + } + + preempt_enable(); +} + /* Write Guest TLB Entry @ Index */ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) { @@ -865,11 +906,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) } tlb = &vcpu->arch.guest_tlb[index]; - /* - * Probe the shadow host TLB for the entry being overwritten, if one - * matches, invalidate it - */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + + kvm_mips_invalidate_guest_tlb(vcpu, tlb); tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); @@ -898,11 +936,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) tlb = &vcpu->arch.guest_tlb[index]; - /* - * Probe the shadow host TLB for the entry being overwritten, if one - * matches, invalidate it - */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + kvm_mips_invalidate_guest_tlb(vcpu, tlb); tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); @@ -1026,6 +1060,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; + int cpu, i; /* * Update PC and hold onto current PC in case there is @@ -1127,16 +1162,31 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; - if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) && - ((kvm_read_c0_guest_entryhi(cop0) & + if (((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) { trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID, nasid); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); + /* + * Regenerate/invalidate kernel MMU + * context. + * The user MMU context will be + * regenerated lazily on re-entry to + * guest user if the guest ASID actually + * changes. + */ + preempt_disable(); + cpu = smp_processor_id(); + kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, + cpu, vcpu); + vcpu->arch.guest_kernel_asid[cpu] = + vcpu->arch.guest_kernel_mm.context.asid[cpu]; + for_each_possible_cpu(i) + if (i != cpu) + vcpu->arch.guest_kernel_asid[i] = 0; + preempt_enable(); } kvm_write_c0_guest_entryhi(cop0, vcpu->arch.gprs[rt]); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a6ea084..ce96149 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -140,6 +140,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return 0; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_mips_free_vcpus(struct kvm *kvm) { unsigned int i; @@ -411,6 +421,31 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } +/* Must be called with preemption disabled, just before entering guest */ +static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + int cpu = smp_processor_id(); + unsigned int gasid; + + /* + * Lazy host ASID regeneration for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, regenerate the host ASID so as to invalidate stale TLB + * entries. + */ + if (!KVM_GUEST_KERNEL_MODE(vcpu)) { + gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; + if (gasid != vcpu->arch.last_user_gasid) { + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, + vcpu); + vcpu->arch.guest_user_asid[cpu] = + vcpu->arch.guest_user_mm.context.asid[cpu]; + vcpu->arch.last_user_gasid = gasid; + } + } +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = 0; @@ -438,6 +473,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) htw_stop(); trace_kvm_enter(vcpu); + + kvm_mips_check_asids(vcpu); + r = vcpu->arch.vcpu_run(run, vcpu); trace_kvm_out(vcpu); @@ -1551,6 +1589,8 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); + kvm_mips_check_asids(vcpu); + /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context * is live), restore FCR31 / MSACSR. diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 121008c..03883ba 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -250,15 +250,27 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); vcpu->arch.guest_kernel_asid[cpu] = vcpu->arch.guest_kernel_mm.context.asid[cpu]; + newasid++; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", + cpu, vcpu->arch.guest_kernel_asid[cpu]); + } + + if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) & + KVM_ENTRYHI_ASID; + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); vcpu->arch.guest_user_asid[cpu] = vcpu->arch.guest_user_mm.context.asid[cpu]; + vcpu->arch.last_user_gasid = gasid; newasid++; kvm_debug("[%d]: cpu_context: %#lx\n", cpu, cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, vcpu->arch.guest_user_asid[cpu]); } diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 0915539..3a5484f 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -175,6 +175,24 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } + } else if (KVM_GUEST_KERNEL_MODE(vcpu) + && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { + /* + * With EVA we may get a TLB exception instead of an address + * error when the guest performs MMIO to KSeg1 addresses. + */ + kvm_debug("Emulate %s MMIO space\n", + store ? "Store to" : "Load from"); + er = kvm_mips_emulate_inst(cause, opc, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate %s MMIO space failed\n", + store ? "Store to" : "Load from"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } else { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? "ST" : "LD", cause, opc, badvaddr); diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 287a656..e407af2 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -245,6 +245,43 @@ static inline int segment_shift(int ssize) } /* + * This array is indexed by the LP field of the HPTE second dword. + * Since this field may contain some RPN bits, some entries are + * replicated so that we get the same value irrespective of RPN. + * The top 4 bits are the page size index (MMU_PAGE_*) for the + * actual page size, the bottom 4 bits are the base page size. + */ +extern u8 hpte_page_sizes[1 << LP_BITS]; + +static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l, + bool is_base_size) +{ + unsigned int i, lp; + + if (!(h & HPTE_V_LARGE)) + return 1ul << 12; + + /* Look at the 8 bit LP value */ + lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1); + i = hpte_page_sizes[lp]; + if (!i) + return 0; + if (!is_base_size) + i >>= 4; + return 1ul << mmu_psize_defs[i & 0xf].shift; +} + +static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 0); +} + +static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 1); +} + +/* * The current system page and segment sizes */ extern int mmu_kernel_ssize; diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 2fd1690..f6fda84 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val) #endif #endif /* __powerpc64__ */ + +/* + * Simple Cache inhibited accessors + * Unlike the DEF_MMIO_* macros, these don't include any h/w memory + * barriers, callers need to manage memory barriers on their own. + * These can only be used in hypervisor real mode. + */ + +static inline u32 _lwzcix(unsigned long addr) +{ + u32 ret; + + __asm__ __volatile__("lwzcix %0,0, %1" + : "=r" (ret) : "r" (addr) : "memory"); + return ret; +} + +static inline void _stbcix(u64 addr, u8 val) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (addr) : "memory"); +} + +static inline void _stwcix(u64 addr, u32 val) +{ + __asm__ __volatile__("stwcix %0,0,%1" + : : "r" (val), "r" (addr) : "memory"); +} + /* * Low level IO stream instructions are defined out of line for now */ diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 5bca220..05cabed 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -105,6 +105,15 @@ #define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80 +/* book3s_hv */ + +/* + * Special trap used to indicate to host that this is a + * passthrough interrupt that could not be handled + * completely in the guest. + */ +#define BOOK3S_INTERRUPT_HV_RM_HARD 0x5555 + #define BOOK3S_IRQPRIO_SYSTEM_RESET 0 #define BOOK3S_IRQPRIO_DATA_SEGMENT 1 #define BOOK3S_IRQPRIO_INST_SEGMENT 2 @@ -136,6 +145,7 @@ #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ #define RESUME_FLAG_ARCH1 (1<<2) +#define RESUME_FLAG_ARCH2 (1<<3) #define RESUME_GUEST 0 #define RESUME_GUEST_NV RESUME_FLAG_NV diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 8f39796..5cf306a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -69,6 +69,43 @@ struct hpte_cache { int pagesize; }; +/* + * Struct for a virtual core. + * Note: entry_exit_map combines a bitmap of threads that have entered + * in the bottom 8 bits and a bitmap of threads that have exited in the + * next 8 bits. This is so that we can atomically set the entry bit + * iff the exit map is 0 without taking a lock. + */ +struct kvmppc_vcore { + int n_runnable; + int num_threads; + int entry_exit_map; + int napping_threads; + int first_vcpuid; + u16 pcpu; + u16 last_cpu; + u8 vcore_state; + u8 in_guest; + struct kvmppc_vcore *master_vcore; + struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; + struct list_head preempt_list; + spinlock_t lock; + struct swait_queue_head wq; + spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ + u64 stolen_tb; + u64 preempt_tb; + struct kvm_vcpu *runner; + struct kvm *kvm; + u64 tb_offset; /* guest timebase - host timebase */ + ulong lpcr; + u32 arch_compat; + ulong pcr; + ulong dpdes; /* doorbell state (POWER8) */ + ulong vtb; /* virtual timebase */ + ulong conferring_threads; + unsigned int halt_poll_ns; +}; + struct kvmppc_vcpu_book3s { struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct { @@ -83,6 +120,7 @@ struct kvmppc_vcpu_book3s { u64 sdr1; u64 hior; u64 msr_mask; + u64 vtb; #ifdef CONFIG_PPC_BOOK3S_32 u32 vsid_pool[VSID_POOL_SIZE]; u32 vsid_next; @@ -191,6 +229,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, struct kvm_vcpu *vcpu); extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, struct kvmppc_book3s_shadow_vcpu *svcpu); +extern int kvm_irq_bypass; static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 88d17b4..8482921 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -20,6 +20,8 @@ #ifndef __ASM_KVM_BOOK3S_64_H__ #define __ASM_KVM_BOOK3S_64_H__ +#include <asm/book3s/64/mmu-hash.h> + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { @@ -97,56 +99,20 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) hpte[0] = cpu_to_be64(hpte_v); } -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { - - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; - /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... - */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; - } - return -1; -} - static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { - int b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; + int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K; unsigned int penc; unsigned long rb = 0, va_low, sllp; unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1); if (v & HPTE_V_LARGE) { - for (b_psize = 0; b_psize < MMU_PAGE_COUNT; b_psize++) { - - /* valid entries have a shift value */ - if (!mmu_psize_defs[b_psize].shift) - continue; - - a_psize = __hpte_actual_psize(lp, b_psize); - if (a_psize != -1) - break; - } + i = hpte_page_sizes[lp]; + b_psize = i & 0xf; + a_psize = i >> 4; } + /* * Ignore the top 14 bits of va * v have top two bits covering segment size, hence move @@ -159,7 +125,6 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, /* This covers 14..54 bits of va*/ rb = (v & ~0x7fUL) << 16; /* AVA field */ - rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ /* * AVA in v had cleared lower 23 bits. We need to derive * that from pteg index @@ -211,49 +176,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, break; } } - rb |= (v >> 54) & 0x300; /* B field */ + rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */ return rb; } -static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l, - bool is_base_size) -{ - - int size, a_psize; - /* Look at the 8 bit LP value */ - unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1); - - /* only handle 4k, 64k and 16M pages for now */ - if (!(h & HPTE_V_LARGE)) - return 1ul << 12; - else { - for (size = 0; size < MMU_PAGE_COUNT; size++) { - /* valid entries have a shift value */ - if (!mmu_psize_defs[size].shift) - continue; - - a_psize = __hpte_actual_psize(lp, size); - if (a_psize != -1) { - if (is_base_size) - return 1ul << mmu_psize_defs[size].shift; - return 1ul << mmu_psize_defs[a_psize].shift; - } - } - - } - return 0; -} - -static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) -{ - return __hpte_page_size(h, l, 0); -} - -static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l) -{ - return __hpte_page_size(h, l, 1); -} - static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) { return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ec35af3..28350a2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -43,6 +43,8 @@ #include <asm/cputhreads.h> #define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) +#define __KVM_HAVE_ARCH_INTC_INITIALIZED + #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif @@ -95,42 +97,49 @@ struct kvmppc_vcpu_book3s; struct kvmppc_book3s_shadow_vcpu; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 sum_exits; - u32 mmio_exits; - u32 signal_exits; - u32 light_exits; + u64 sum_exits; + u64 mmio_exits; + u64 signal_exits; + u64 light_exits; /* Account for special types of light exits: */ - u32 itlb_real_miss_exits; - u32 itlb_virt_miss_exits; - u32 dtlb_real_miss_exits; - u32 dtlb_virt_miss_exits; - u32 syscall_exits; - u32 isi_exits; - u32 dsi_exits; - u32 emulated_inst_exits; - u32 dec_exits; - u32 ext_intr_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 dbell_exits; - u32 gdbell_exits; - u32 ld; - u32 st; + u64 itlb_real_miss_exits; + u64 itlb_virt_miss_exits; + u64 dtlb_real_miss_exits; + u64 dtlb_virt_miss_exits; + u64 syscall_exits; + u64 isi_exits; + u64 dsi_exits; + u64 emulated_inst_exits; + u64 dec_exits; + u64 ext_intr_exits; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; + u64 halt_wait_ns; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_successful_wait; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 dbell_exits; + u64 gdbell_exits; + u64 ld; + u64 st; #ifdef CONFIG_PPC_BOOK3S - u32 pf_storage; - u32 pf_instruc; - u32 sp_storage; - u32 sp_instruc; - u32 queue_intr; - u32 ld_slow; - u32 st_slow; + u64 pf_storage; + u64 pf_instruc; + u64 sp_storage; + u64 sp_instruc; + u64 queue_intr; + u64 ld_slow; + u64 st_slow; #endif + u64 pthru_all; + u64 pthru_host; + u64 pthru_bad_aff; }; enum kvm_exit_types { @@ -197,6 +206,8 @@ struct kvmppc_spapr_tce_table { struct kvmppc_xics; struct kvmppc_icp; +struct kvmppc_passthru_irqmap; + /* * The reverse mapping array has one entry for each HPTE, * which stores the guest's view of the second word of the HPTE @@ -267,6 +278,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_passthru_irqmap *pimap; #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -275,41 +287,6 @@ struct kvm_arch { #endif }; -/* - * Struct for a virtual core. - * Note: entry_exit_map combines a bitmap of threads that have entered - * in the bottom 8 bits and a bitmap of threads that have exited in the - * next 8 bits. This is so that we can atomically set the entry bit - * iff the exit map is 0 without taking a lock. - */ -struct kvmppc_vcore { - int n_runnable; - int num_threads; - int entry_exit_map; - int napping_threads; - int first_vcpuid; - u16 pcpu; - u16 last_cpu; - u8 vcore_state; - u8 in_guest; - struct kvmppc_vcore *master_vcore; - struct list_head runnable_threads; - struct list_head preempt_list; - spinlock_t lock; - struct swait_queue_head wq; - spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ - u64 stolen_tb; - u64 preempt_tb; - struct kvm_vcpu *runner; - struct kvm *kvm; - u64 tb_offset; /* guest timebase - host timebase */ - ulong lpcr; - u32 arch_compat; - ulong pcr; - ulong dpdes; /* doorbell state (POWER8) */ - ulong conferring_threads; -}; - #define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff) #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) #define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) @@ -329,6 +306,7 @@ struct kvmppc_vcore { #define VCORE_SLEEPING 3 #define VCORE_RUNNING 4 #define VCORE_EXITING 5 +#define VCORE_POLLING 6 /* * Struct used to manage memory for a virtual processor area @@ -397,6 +375,20 @@ struct kvmhv_tb_accumulator { u64 tb_max; /* max time */ }; +#ifdef CONFIG_PPC_BOOK3S_64 +struct kvmppc_irq_map { + u32 r_hwirq; + u32 v_hwirq; + struct irq_desc *desc; +}; + +#define KVMPPC_PIRQ_MAPPED 1024 +struct kvmppc_passthru_irqmap { + int n_mapped; + struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED]; +}; +#endif + # ifdef CONFIG_PPC_FSL_BOOK3E #define KVMPPC_BOOKE_IAC_NUM 2 #define KVMPPC_BOOKE_DAC_NUM 2 @@ -483,7 +475,6 @@ struct kvm_vcpu_arch { ulong purr; ulong spurr; ulong ic; - ulong vtb; ulong dscr; ulong amr; ulong uamor; @@ -668,7 +659,6 @@ struct kvm_vcpu_arch { long pgfault_index; unsigned long pgfault_hpte[2]; - struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544eda..f6e4964 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -287,6 +287,10 @@ struct kvmppc_ops { long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, unsigned long arg); int (*hcall_implemented)(unsigned long hcall); + int (*irq_bypass_add_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); }; extern struct kvmppc_ops *kvmppc_hv_ops; @@ -453,8 +457,19 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } + +static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( + struct kvm *kvm) +{ + if (kvm && kvm_irq_bypass) + return kvm->arch.pimap; + return NULL; +} + extern void kvmppc_alloc_host_rm_ops(void); extern void kvmppc_free_host_rm_ops(void); +extern void kvmppc_free_pimap(struct kvm *kvm); +extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -464,10 +479,23 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xics_ipi_action(void); +extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq, + unsigned long host_irq); +extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + unsigned long host_irq); +extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, + struct kvmppc_irq_map *irq_map, + struct kvmppc_passthru_irqmap *pimap); extern int h_ipi_redirect; #else +static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( + struct kvm *kvm) + { return NULL; } static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; +static inline void kvmppc_free_pimap(struct kvm *kvm) {}; +static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) + { return 0; } static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e2fb408..b78e8d3 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -271,6 +271,7 @@ static inline bool early_radix_enabled(void) #define MMU_PAGE_16G 13 #define MMU_PAGE_64G 14 +/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */ #define MMU_PAGE_COUNT 15 #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ee05bd2..e958b70 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func, int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func, uint64_t offset, uint32_t data); int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority); +int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority); int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority); int64_t opal_register_exception_handler(uint64_t opal_exception, uint64_t handler_address, diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index 0cbd813..1b46b52 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/pci_hotplug.h> +#include <linux/irq.h> #include <misc/cxl-base.h> #include <asm/opal-api.h> @@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num); void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num); int pnv_cxl_get_irq_count(struct pci_dev *dev); struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev); +int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq); +bool is_pnv_opal_msi(struct irq_chip *chip); #ifdef CONFIG_CXL_BASE int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f69f40f..978dada 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -737,6 +737,7 @@ #define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */ #define SPRN_MMCR1 798 #define SPRN_MMCR2 785 +#define SPRN_UMMCR2 769 #define SPRN_MMCRA 0x312 #define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */ #define MMCRA_SDAR_DCACHE_MISS 0x40000000UL diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b89d14c..a51ae9b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -506,7 +506,6 @@ int main(void) DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic)); - DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb)); DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr)); DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); @@ -557,6 +556,7 @@ int main(void) DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr)); DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr)); DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes)); + DEFINE(VCORE_VTB, offsetof(struct kvmppc_vcore, vtb)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c2024ac..029be26 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -22,6 +22,9 @@ config KVM select ANON_INODES select HAVE_KVM_EVENTFD select SRCU + select KVM_VFIO + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS config KVM_BOOK3S_HANDLER bool diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 855d4b9..7dd89b7 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm -common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o +common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. CFLAGS_emulate.o := -I. CFLAGS_emulate_loadstore.o := -I. -common-objs-y += powerpc.o emulate.o emulate_loadstore.o +common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o @@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj) kvm-e500-objs := \ $(common-objs-y) \ + emulate.o \ booke.o \ booke_emulate.o \ booke_interrupts.o \ @@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) kvm-e500mc-objs := \ $(common-objs-y) \ + emulate.o \ booke.o \ booke_emulate.o \ bookehv_interrupts.o \ @@ -61,9 +63,6 @@ kvm-pr-y := \ book3s_32_mmu.o ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE -kvm-book3s_64-module-objs := \ - $(KVM)/coalesced_mmio.o - kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_rmhandlers.o endif @@ -89,11 +88,8 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o -kvm-book3s_64-module-objs += \ - $(KVM)/kvm_main.o \ - $(KVM)/eventfd.o \ - powerpc.o \ - emulate_loadstore.o \ +kvm-book3s_64-module-objs := \ + $(common-objs-y) \ book3s.o \ book3s_64_vio.o \ book3s_rtas.o \ @@ -103,6 +99,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) kvm-book3s_32-objs := \ $(common-objs-y) \ + emulate.o \ fpu.o \ book3s_paired_singles.o \ book3s.o \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 47018fc..b6952dd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) }, + { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) }, + { "halt_wait_ns", VCPU_STAT(halt_wait_ns) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, + { "halt_successful_wait", VCPU_STAT(halt_successful_wait) }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, @@ -64,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ld_slow", VCPU_STAT(ld_slow) }, { "st", VCPU_STAT(st) }, { "st_slow", VCPU_STAT(st_slow) }, + { "pthru_all", VCPU_STAT(pthru_all) }, + { "pthru_host", VCPU_STAT(pthru_host) }, + { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, { NULL } }; @@ -592,9 +599,6 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_BESCR: *val = get_reg_val(id, vcpu->arch.bescr); break; - case KVM_REG_PPC_VTB: - *val = get_reg_val(id, vcpu->arch.vtb); - break; case KVM_REG_PPC_IC: *val = get_reg_val(id, vcpu->arch.ic); break; @@ -666,9 +670,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_BESCR: vcpu->arch.bescr = set_reg_val(id, *val); break; - case KVM_REG_PPC_VTB: - vcpu->arch.vtb = set_reg_val(id, *val); - break; case KVM_REG_PPC_IC: vcpu->arch.ic = set_reg_val(id, *val); break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 2afdb9c..8359752 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -498,6 +498,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_MMCR0: case SPRN_MMCR1: case SPRN_MMCR2: + case SPRN_UMMCR2: #endif break; unprivileged: @@ -579,7 +580,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val *spr_val = vcpu->arch.spurr; break; case SPRN_VTB: - *spr_val = vcpu->arch.vtb; + *spr_val = to_book3s(vcpu)->vtb; break; case SPRN_IC: *spr_val = vcpu->arch.ic; @@ -640,6 +641,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_MMCR0: case SPRN_MMCR1: case SPRN_MMCR2: + case SPRN_UMMCR2: case SPRN_TIR: #endif *spr_val = 0; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2fd5580..3686471 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -53,11 +53,15 @@ #include <asm/smp.h> #include <asm/dbell.h> #include <asm/hmi.h> +#include <asm/pnv-pci.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/hugetlb.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> #include <linux/module.h> +#include <linux/compiler.h> #include "book3s.h" @@ -70,6 +74,8 @@ /* Used to indicate that a guest page fault needs to be handled */ #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) +/* Used to indicate that a guest passthrough interrupt needs to be handled */ +#define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2) /* Used as a "null" value for timebase values */ #define TB_NIL (~(u64)0) @@ -89,14 +95,55 @@ static struct kernel_param_ops module_param_ops = { .get = param_get_int, }; +module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization"); + module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); + +/* Factor by which the vcore halt poll interval is grown, default is to double + */ +static unsigned int halt_poll_ns_grow = 2; +module_param(halt_poll_ns_grow, int, S_IRUGO); +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); + +/* Factor by which the vcore halt poll interval is shrunk, default is to reset + */ +static unsigned int halt_poll_ns_shrink; +module_param(halt_poll_ns_shrink, int, S_IRUGO); +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, + int *ip) +{ + int i = *ip; + struct kvm_vcpu *vcpu; + + while (++i < MAX_SMT_THREADS) { + vcpu = READ_ONCE(vc->runnable_threads[i]); + if (vcpu) { + *ip = i; + return vcpu; + } + } + return NULL; +} + +/* Used to traverse the list of runnable threads for a given vcore */ +#define for_each_runnable_thread(i, vcpu, vc) \ + for (i = -1; (vcpu = next_runnable_thread(vc, &i)); ) + static bool kvmppc_ipi_thread(int cpu) { /* On POWER8 for IPIs to threads in the same core, use msgsnd */ @@ -991,6 +1038,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_HV_RM_HARD: + r = RESUME_PASSTHROUGH; + break; default: kvmppc_dump_regs(vcpu); printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", @@ -1149,6 +1199,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_DPDES: *val = get_reg_val(id, vcpu->arch.vcore->dpdes); break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, vcpu->arch.vcore->vtb); + break; case KVM_REG_PPC_DAWR: *val = get_reg_val(id, vcpu->arch.dawr); break; @@ -1341,6 +1394,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_DPDES: vcpu->arch.vcore->dpdes = set_reg_val(id, *val); break; + case KVM_REG_PPC_VTB: + vcpu->arch.vcore->vtb = set_reg_val(id, *val); + break; case KVM_REG_PPC_DAWR: vcpu->arch.dawr = set_reg_val(id, *val); break; @@ -1493,7 +1549,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) if (vcore == NULL) return NULL; - INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); init_swait_queue_head(&vcore->wq); @@ -1802,7 +1857,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; spin_unlock_irq(&vcpu->arch.tbacct_lock); --vc->n_runnable; - list_del(&vcpu->arch.run_list); + WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL); } static int kvmppc_grab_hwthread(int cpu) @@ -2048,66 +2103,6 @@ static void init_master_vcore(struct kvmppc_vcore *vc) vc->conferring_threads = 0; } -/* - * See if the existing subcores can be split into 3 (or fewer) subcores - * of at most two threads each, so we can fit in another vcore. This - * assumes there are at most two subcores and at most 6 threads in total. - */ -static bool can_split_piggybacked_subcores(struct core_info *cip) -{ - int sub, new_sub; - int large_sub = -1; - int thr; - int n_subcores = cip->n_subcores; - struct kvmppc_vcore *vc, *vcnext; - struct kvmppc_vcore *master_vc = NULL; - - for (sub = 0; sub < cip->n_subcores; ++sub) { - if (cip->subcore_threads[sub] <= 2) - continue; - if (large_sub >= 0) - return false; - large_sub = sub; - vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore, - preempt_list); - if (vc->num_threads > 2) - return false; - n_subcores += (cip->subcore_threads[sub] - 1) >> 1; - } - if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2)) - return false; - - /* - * Seems feasible, so go through and move vcores to new subcores. - * Note that when we have two or more vcores in one subcore, - * all those vcores must have only one thread each. - */ - new_sub = cip->n_subcores; - thr = 0; - sub = large_sub; - list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) { - if (thr >= 2) { - list_del(&vc->preempt_list); - list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]); - /* vc->num_threads must be 1 */ - if (++cip->subcore_threads[new_sub] == 1) { - cip->subcore_vm[new_sub] = vc->kvm; - init_master_vcore(vc); - master_vc = vc; - ++cip->n_subcores; - } else { - vc->master_vcore = master_vc; - ++new_sub; - } - } - thr += vc->num_threads; - } - cip->subcore_threads[large_sub] = 2; - cip->max_subcore_threads = 2; - - return true; -} - static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) { int n_threads = vc->num_threads; @@ -2118,23 +2113,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (n_threads < cip->max_subcore_threads) n_threads = cip->max_subcore_threads; - if (subcore_config_ok(cip->n_subcores + 1, n_threads)) { - cip->max_subcore_threads = n_threads; - } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 && - vc->num_threads <= 2) { - /* - * We may be able to fit another subcore in by - * splitting an existing subcore with 3 or 4 - * threads into two 2-thread subcores, or one - * with 5 or 6 threads into three subcores. - * We can only do this if those subcores have - * piggybacked virtual cores. - */ - if (!can_split_piggybacked_subcores(cip)) - return false; - } else { + if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) return false; - } + cip->max_subcore_threads = n_threads; sub = cip->n_subcores; ++cip->n_subcores; @@ -2148,43 +2129,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) return true; } -static bool can_piggyback_subcore(struct kvmppc_vcore *pvc, - struct core_info *cip, int sub) -{ - struct kvmppc_vcore *vc; - int n_thr; - - vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore, - preempt_list); - - /* require same VM and same per-core reg values */ - if (pvc->kvm != vc->kvm || - pvc->tb_offset != vc->tb_offset || - pvc->pcr != vc->pcr || - pvc->lpcr != vc->lpcr) - return false; - - /* P8 guest with > 1 thread per core would see wrong TIR value */ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - (vc->num_threads > 1 || pvc->num_threads > 1)) - return false; - - n_thr = cip->subcore_threads[sub] + pvc->num_threads; - if (n_thr > cip->max_subcore_threads) { - if (!subcore_config_ok(cip->n_subcores, n_thr)) - return false; - cip->max_subcore_threads = n_thr; - } - - cip->total_threads += pvc->num_threads; - cip->subcore_threads[sub] = n_thr; - pvc->master_vcore = vc; - list_del(&pvc->preempt_list); - list_add_tail(&pvc->preempt_list, &cip->vcs[sub]); - - return true; -} - /* * Work out whether it is possible to piggyback the execution of * vcore *pvc onto the execution of the other vcores described in *cip. @@ -2192,27 +2136,18 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc, static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, int target_threads) { - int sub; - if (cip->total_threads + pvc->num_threads > target_threads) return false; - for (sub = 0; sub < cip->n_subcores; ++sub) - if (cip->subcore_threads[sub] && - can_piggyback_subcore(pvc, cip, sub)) - return true; - - if (can_dynamic_split(pvc, cip)) - return true; - return false; + return can_dynamic_split(pvc, cip); } static void prepare_threads(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + int i; + struct kvm_vcpu *vcpu; - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { if (signal_pending(vcpu->arch.run_task)) vcpu->arch.ret = -EINTR; else if (vcpu->arch.vpa.update_pending || @@ -2259,15 +2194,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) { - int still_running = 0; + int still_running = 0, i; u64 now; long ret; - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu *vcpu; spin_lock(&vc->lock); now = get_tb(); - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { /* cancel pending dec exception if dec is positive */ if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) @@ -2307,8 +2241,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) } if (vc->n_runnable > 0 && vc->runner == NULL) { /* make sure there's a candidate runner awake */ - vcpu = list_first_entry(&vc->runnable_threads, - struct kvm_vcpu, arch.run_list); + i = -1; + vcpu = next_runnable_thread(vc, &i); wake_up(&vcpu->arch.cpu_run); } } @@ -2361,7 +2295,7 @@ static inline void kvmppc_set_host_core(int cpu) */ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu *vcpu; int i; int srcu_idx; struct core_info core_info; @@ -2397,8 +2331,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ if ((threads_per_core > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); @@ -2477,8 +2410,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) active |= 1 << thr; list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { pvc->pcpu = pcpu + thr; - list_for_each_entry(vcpu, &pvc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, vcpu, pvc) { kvmppc_start_thread(vcpu, pvc); kvmppc_create_dtl_entry(vcpu, pvc); trace_kvm_guest_enter(vcpu); @@ -2604,34 +2536,92 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, finish_wait(&vcpu->arch.cpu_run, &wait); } +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) +{ + /* 10us base */ + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) + vc->halt_poll_ns = 10000; + else + vc->halt_poll_ns *= halt_poll_ns_grow; + + if (vc->halt_poll_ns > halt_poll_max_ns) + vc->halt_poll_ns = halt_poll_max_ns; +} + +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) +{ + if (halt_poll_ns_shrink == 0) + vc->halt_poll_ns = 0; + else + vc->halt_poll_ns /= halt_poll_ns_shrink; +} + +/* Check to see if any of the runnable vcpus on the vcore have pending + * exceptions or are no longer ceded + */ +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu; + int i; + + for_each_runnable_thread(i, vcpu, vc) { + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + return 1; + } + + return 0; +} + /* * All the vcpus in this vcore are idle, so wait for a decrementer * or external interrupt to one of the vcpus. vc->lock is held. */ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu; + ktime_t cur, start_poll, start_wait; int do_sleep = 1; + u64 block_ns; DECLARE_SWAITQUEUE(wait); - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + /* Poll for pending exceptions and ceded state */ + cur = start_poll = ktime_get(); + if (vc->halt_poll_ns) { + ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); + ++vc->runner->stat.halt_attempted_poll; - /* - * Check one last time for pending exceptions and ceded state after - * we put ourselves on the wait queue - */ - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { - do_sleep = 0; - break; + vc->vcore_state = VCORE_POLLING; + spin_unlock(&vc->lock); + + do { + if (kvmppc_vcore_check_block(vc)) { + do_sleep = 0; + break; + } + cur = ktime_get(); + } while (single_task_running() && ktime_before(cur, stop)); + + spin_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; + + if (!do_sleep) { + ++vc->runner->stat.halt_successful_poll; + goto out; } } - if (!do_sleep) { + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + + if (kvmppc_vcore_check_block(vc)) { finish_swait(&vc->wq, &wait); - return; + do_sleep = 0; + /* If we polled, count this as a successful poll */ + if (vc->halt_poll_ns) + ++vc->runner->stat.halt_successful_poll; + goto out; } + start_wait = ktime_get(); + vc->vcore_state = VCORE_SLEEPING; trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); @@ -2640,13 +2630,52 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); + ++vc->runner->stat.halt_successful_wait; + + cur = ktime_get(); + +out: + block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll); + + /* Attribute wait time */ + if (do_sleep) { + vc->runner->stat.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(start_wait); + /* Attribute failed poll time */ + if (vc->halt_poll_ns) + vc->runner->stat.halt_poll_fail_ns += + ktime_to_ns(start_wait) - + ktime_to_ns(start_poll); + } else { + /* Attribute successful poll time */ + if (vc->halt_poll_ns) + vc->runner->stat.halt_poll_success_ns += + ktime_to_ns(cur) - + ktime_to_ns(start_poll); + } + + /* Adjust poll time */ + if (halt_poll_max_ns) { + if (block_ns <= vc->halt_poll_ns) + ; + /* We slept and blocked for longer than the max halt time */ + else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns) + shrink_halt_poll_ns(vc); + /* We slept and our poll time is too small */ + else if (vc->halt_poll_ns < halt_poll_max_ns && + block_ns < halt_poll_max_ns) + grow_halt_poll_ns(vc); + } else + vc->halt_poll_ns = 0; + + trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - int n_ceded; + int n_ceded, i; struct kvmppc_vcore *vc; - struct kvm_vcpu *v, *vn; + struct kvm_vcpu *v; trace_kvmppc_run_vcpu_enter(vcpu); @@ -2666,7 +2695,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; vcpu->arch.busy_preempt = TB_NIL; - list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); + WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu); ++vc->n_runnable; /* @@ -2706,8 +2735,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE); continue; } - list_for_each_entry_safe(v, vn, &vc->runnable_threads, - arch.run_list) { + for_each_runnable_thread(i, v, vc) { kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { kvmppc_remove_runnable(vc, v); @@ -2720,7 +2748,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) break; n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + for_each_runnable_thread(i, v, vc) { if (!v->arch.pending_exceptions) n_ceded += v->arch.ceded; else @@ -2759,8 +2787,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { /* Wake up some vcpu to run the core */ - v = list_first_entry(&vc->runnable_threads, - struct kvm_vcpu, arch.run_list); + i = -1; + v = next_runnable_thread(vc, &i); wake_up(&v->arch.cpu_run); } @@ -2818,7 +2846,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - } + } else if (r == RESUME_PASSTHROUGH) + r = kvmppc_xics_rm_complete(vcpu, 0); } while (is_kvmppc_resume_guest(r)); out: @@ -3247,6 +3276,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); kvmppc_free_hpt(kvm); + + kvmppc_free_pimap(kvm); } /* We don't need to emulate any privileged instructions or dcbz */ @@ -3282,6 +3313,184 @@ static int kvmppc_core_check_processor_compat_hv(void) return 0; } +#ifdef CONFIG_KVM_XICS + +void kvmppc_free_pimap(struct kvm *kvm) +{ + kfree(kvm->arch.pimap); +} + +static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) +{ + return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); +} + +static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) +{ + struct irq_desc *desc; + struct kvmppc_irq_map *irq_map; + struct kvmppc_passthru_irqmap *pimap; + struct irq_chip *chip; + int i; + + if (!kvm_irq_bypass) + return 1; + + desc = irq_to_desc(host_irq); + if (!desc) + return -EIO; + + mutex_lock(&kvm->lock); + + pimap = kvm->arch.pimap; + if (pimap == NULL) { + /* First call, allocate structure to hold IRQ map */ + pimap = kvmppc_alloc_pimap(); + if (pimap == NULL) { + mutex_unlock(&kvm->lock); + return -ENOMEM; + } + kvm->arch.pimap = pimap; + } + + /* + * For now, we only support interrupts for which the EOI operation + * is an OPAL call followed by a write to XIRR, since that's + * what our real-mode EOI code does. + */ + chip = irq_data_get_irq_chip(&desc->irq_data); + if (!chip || !is_pnv_opal_msi(chip)) { + pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", + host_irq, guest_gsi); + mutex_unlock(&kvm->lock); + return -ENOENT; + } + + /* + * See if we already have an entry for this guest IRQ number. + * If it's mapped to a hardware IRQ number, that's an error, + * otherwise re-use this entry. + */ + for (i = 0; i < pimap->n_mapped; i++) { + if (guest_gsi == pimap->mapped[i].v_hwirq) { + if (pimap->mapped[i].r_hwirq) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + break; + } + } + + if (i == KVMPPC_PIRQ_MAPPED) { + mutex_unlock(&kvm->lock); + return -EAGAIN; /* table is full */ + } + + irq_map = &pimap->mapped[i]; + + irq_map->v_hwirq = guest_gsi; + irq_map->desc = desc; + + /* + * Order the above two stores before the next to serialize with + * the KVM real mode handler. + */ + smp_wmb(); + irq_map->r_hwirq = desc->irq_data.hwirq; + + if (i == pimap->n_mapped) + pimap->n_mapped++; + + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) +{ + struct irq_desc *desc; + struct kvmppc_passthru_irqmap *pimap; + int i; + + if (!kvm_irq_bypass) + return 0; + + desc = irq_to_desc(host_irq); + if (!desc) + return -EIO; + + mutex_lock(&kvm->lock); + + if (kvm->arch.pimap == NULL) { + mutex_unlock(&kvm->lock); + return 0; + } + pimap = kvm->arch.pimap; + + for (i = 0; i < pimap->n_mapped; i++) { + if (guest_gsi == pimap->mapped[i].v_hwirq) + break; + } + + if (i == pimap->n_mapped) { + mutex_unlock(&kvm->lock); + return -ENODEV; + } + + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); + + /* invalidate the entry */ + pimap->mapped[i].r_hwirq = 0; + + /* + * We don't free this structure even when the count goes to + * zero. The structure is freed when we destroy the VM. + */ + + mutex_unlock(&kvm->lock); + return 0; +} + +static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret = 0; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + irqfd->producer = prod; + + ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); + if (ret) + pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n", + prod->irq, irqfd->gsi, ret); + + return ret; +} + +static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * default external interrupt handling mode - KVM real mode + * will switch back to host. + */ + ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); + if (ret) + pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n", + prod->irq, irqfd->gsi, ret); +} +#endif + static long kvm_arch_vm_ioctl_hv(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3400,6 +3609,10 @@ static struct kvmppc_ops kvm_ops_hv = { .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, .hcall_implemented = kvmppc_hcall_impl_hv, +#ifdef CONFIG_KVM_XICS + .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, + .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, +#endif }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5f0380d..0c84d6b 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -25,6 +25,7 @@ #include <asm/xics.h> #include <asm/dbell.h> #include <asm/cputhreads.h> +#include <asm/io.h> #define KVM_CMA_CHUNK_ORDER 18 @@ -286,3 +287,158 @@ void kvmhv_commence_exit(int trap) struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); + +#ifdef CONFIG_KVM_XICS +static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap, + u32 xisr) +{ + int i; + + /* + * We access the mapped array here without a lock. That + * is safe because we never reduce the number of entries + * in the array and we never change the v_hwirq field of + * an entry once it is set. + * + * We have also carefully ordered the stores in the writer + * and the loads here in the reader, so that if we find a matching + * hwirq here, the associated GSI and irq_desc fields are valid. + */ + for (i = 0; i < pimap->n_mapped; i++) { + if (xisr == pimap->mapped[i].r_hwirq) { + /* + * Order subsequent reads in the caller to serialize + * with the writer. + */ + smp_rmb(); + return &pimap->mapped[i]; + } + } + return NULL; +} + +/* + * If we have an interrupt that's not an IPI, check if we have a + * passthrough adapter and if so, check if this external interrupt + * is for the adapter. + * We will attempt to deliver the IRQ directly to the target VCPU's + * ICP, the virtual ICP (based on affinity - the xive value in ICS). + * + * If the delivery fails or if this is not for a passthrough adapter, + * return to the host to handle this interrupt. We earlier + * saved a copy of the XIRR in the PACA, it will be picked up by + * the host ICP driver. + */ +static int kvmppc_check_passthru(u32 xisr, __be32 xirr) +{ + struct kvmppc_passthru_irqmap *pimap; + struct kvmppc_irq_map *irq_map; + struct kvm_vcpu *vcpu; + + vcpu = local_paca->kvm_hstate.kvm_vcpu; + if (!vcpu) + return 1; + pimap = kvmppc_get_passthru_irqmap(vcpu->kvm); + if (!pimap) + return 1; + irq_map = get_irqmap(pimap, xisr); + if (!irq_map) + return 1; + + /* We're handling this interrupt, generic code doesn't need to */ + local_paca->kvm_hstate.saved_xirr = 0; + + return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap); +} + +#else +static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) +{ + return 1; +} +#endif + +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * 2 Passthrough that needs completion in the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + * -2 if there is PCI passthrough external interrupt that was handled + */ + +long kvmppc_read_intr(void) +{ + unsigned long xics_phys; + u32 h_xirr; + __be32 xirr; + u32 xisr; + u8 host_ipi; + + /* see if a host IPI is pending */ + host_ipi = local_paca->kvm_hstate.host_ipi; + if (host_ipi) + return 1; + + /* Now read the interrupt from the ICP */ + xics_phys = local_paca->kvm_hstate.xics_phys; + if (unlikely(!xics_phys)) + return 1; + + /* + * Save XIRR for later. Since we get control in reverse endian + * on LE systems, save it byte reversed and fetch it back in + * host endian. Note that xirr is the value read from the + * XIRR register, while h_xirr is the host endian version. + */ + xirr = _lwzcix(xics_phys + XICS_XIRR); + h_xirr = be32_to_cpu(xirr); + local_paca->kvm_hstate.saved_xirr = h_xirr; + xisr = h_xirr & 0xffffff; + /* + * Ensure that the store/load complete to guarantee all side + * effects of loading from XIRR has completed + */ + smp_mb(); + + /* if nothing pending in the ICP */ + if (!xisr) + return 0; + + /* We found something in the ICP... + * + * If it is an IPI, clear the MFRR and EOI it. + */ + if (xisr == XICS_IPI) { + _stbcix(xics_phys + XICS_MFRR, 0xff); + _stwcix(xics_phys + XICS_XIRR, xirr); + /* + * Need to ensure side effects of above stores + * complete before proceeding. + */ + smp_mb(); + + /* + * We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + host_ipi = local_paca->kvm_hstate.host_ipi; + if (unlikely(host_ipi != 0)) { + /* We raced with the host, + * we need to resend that IPI, bummer + */ + _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + /* Let side effects complete */ + smp_mb(); + return 1; + } + + /* OK, it's an IPI for us */ + local_paca->kvm_hstate.saved_xirr = 0; + return -1; + } + + return kvmppc_check_passthru(xisr, xirr); +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 980d8a6..82ff5de 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/kvm_host.h> #include <linux/err.h> +#include <linux/kernel_stat.h> #include <asm/kvm_book3s.h> #include <asm/kvm_ppc.h> @@ -18,7 +19,10 @@ #include <asm/debug.h> #include <asm/synch.h> #include <asm/cputhreads.h> +#include <asm/pgtable.h> #include <asm/ppc-opcode.h> +#include <asm/pnv-pci.h> +#include <asm/opal.h> #include "book3s_xics.h" @@ -26,9 +30,12 @@ int h_ipi_redirect = 1; EXPORT_SYMBOL(h_ipi_redirect); +int kvm_irq_bypass = 1; +EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); +static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ static void ics_rm_check_resend(struct kvmppc_xics *xics, @@ -708,10 +715,123 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) icp->rm_action |= XICS_RM_NOTIFY_EOI; icp->rm_eoied_irq = irq; } + + if (state->host_irq) { + ++vcpu->stat.pthru_all; + if (state->intr_cpu != -1) { + int pcpu = raw_smp_processor_id(); + + pcpu = cpu_first_thread_sibling(pcpu); + ++vcpu->stat.pthru_host; + if (state->intr_cpu != pcpu) { + ++vcpu->stat.pthru_bad_aff; + xics_opal_rm_set_server(state->host_irq, pcpu); + } + state->intr_cpu = -1; + } + } bail: return check_too_hard(xics, icp); } +unsigned long eoi_rc; + +static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) +{ + unsigned long xics_phys; + int64_t rc; + + rc = pnv_opal_pci_msi_eoi(c, hwirq); + + if (rc) + eoi_rc = rc; + + iosync(); + + /* EOI it */ + xics_phys = local_paca->kvm_hstate.xics_phys; + _stwcix(xics_phys + XICS_XIRR, xirr); +} + +static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu) +{ + unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2; + + return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY); +} + +/* + * Increment a per-CPU 32-bit unsigned integer variable. + * Safe to call in real-mode. Handles vmalloc'ed addresses + * + * ToDo: Make this work for any integral type + */ + +static inline void this_cpu_inc_rm(unsigned int __percpu *addr) +{ + unsigned long l; + unsigned int *raddr; + int cpu = smp_processor_id(); + + raddr = per_cpu_ptr(addr, cpu); + l = (unsigned long)raddr; + + if (REGION_ID(l) == VMALLOC_REGION_ID) { + l = vmalloc_to_phys(raddr); + raddr = (unsigned int *)l; + } + ++*raddr; +} + +/* + * We don't try to update the flags in the irq_desc 'istate' field in + * here as would happen in the normal IRQ handling path for several reasons: + * - state flags represent internal IRQ state and are not expected to be + * updated outside the IRQ subsystem + * - more importantly, these are useful for edge triggered interrupts, + * IRQ probing, etc., but we are only handling MSI/MSIx interrupts here + * and these states shouldn't apply to us. + * + * However, we do update irq_stats - we somewhat duplicate the code in + * kstat_incr_irqs_this_cpu() for this since this function is defined + * in irq/internal.h which we don't want to include here. + * The only difference is that desc->kstat_irqs is an allocated per CPU + * variable and could have been vmalloc'ed, so we can't directly + * call __this_cpu_inc() on it. The kstat structure is a static + * per CPU variable and it should be accessible by real-mode KVM. + * + */ +static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) +{ + this_cpu_inc_rm(desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} + +long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, + u32 xirr, + struct kvmppc_irq_map *irq_map, + struct kvmppc_passthru_irqmap *pimap) +{ + struct kvmppc_xics *xics; + struct kvmppc_icp *icp; + u32 irq; + + irq = irq_map->v_hwirq; + xics = vcpu->kvm->arch.xics; + icp = vcpu->arch.icp; + + kvmppc_rm_handle_irq_desc(irq_map->desc); + icp_rm_deliver_irq(xics, icp, irq); + + /* EOI the interrupt */ + icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr); + + if (check_too_hard(xics, icp) == H_TOO_HARD) + return 2; + else + return -2; +} + /* --- Non-real mode XICS-related built-in routines --- */ /** diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9756555..c3c1d1b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -221,6 +221,13 @@ kvmppc_primary_no_guest: li r3, 0 /* Don't wake on privileged (OS) doorbell */ b kvm_do_nap +/* + * kvm_novcpu_wakeup + * Entered from kvm_start_guest if kvm_hstate.napping is set + * to NAPPING_NOVCPU + * r2 = kernel TOC + * r13 = paca + */ kvm_novcpu_wakeup: ld r1, HSTATE_HOST_R1(r13) ld r5, HSTATE_KVM_VCORE(r13) @@ -230,6 +237,13 @@ kvm_novcpu_wakeup: /* check the wake reason */ bl kvmppc_check_wake_reason + /* + * Restore volatile registers since we could have called + * a C routine in kvmppc_check_wake_reason. + * r5 = VCORE + */ + ld r5, HSTATE_KVM_VCORE(r13) + /* see if any other thread is already exiting */ lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -322,6 +336,11 @@ kvm_start_guest: /* Check the wake reason in SRR1 to see why we got here */ bl kvmppc_check_wake_reason + /* + * kvmppc_check_wake_reason could invoke a C routine, but we + * have no volatile registers to restore when we return. + */ + cmpdi r3, 0 bge kvm_no_guest @@ -625,9 +644,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) 38: BEGIN_FTR_SECTION - /* DPDES is shared between threads */ + /* DPDES and VTB are shared between threads */ ld r8, VCORE_DPDES(r5) + ld r7, VCORE_VTB(r5) mtspr SPRN_DPDES, r8 + mtspr SPRN_VTB, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Mark the subcore state as inside guest */ @@ -787,10 +808,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_CIABR, r7 mtspr SPRN_TAR, r8 ld r5, VCPU_IC(r4) - ld r6, VCPU_VTB(r4) - mtspr SPRN_IC, r5 - mtspr SPRN_VTB, r6 ld r8, VCPU_EBBHR(r4) + mtspr SPRN_IC, r5 mtspr SPRN_EBBHR, r8 ld r5, VCPU_EBBRR(r4) ld r6, VCPU_BESCR(r4) @@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) cmpwi r3, 512 /* 1 microsecond */ blt hdec_soon +deliver_guest_interrupt: ld r6, VCPU_CTR(r4) ld r7, VCPU_XER(r4) @@ -895,7 +915,6 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 -deliver_guest_interrupt: /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG @@ -1155,10 +1174,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * set, we know the host wants us out so let's do it now */ bl kvmppc_read_intr + + /* + * Restore the active volatile registers after returning from + * a C function. + */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_EXTERNAL + + /* + * kvmppc_read_intr return codes: + * + * Exit to host (r3 > 0) + * 1 An interrupt is pending that needs to be handled by the host + * Exit guest and return to host by branching to guest_exit_cont + * + * 2 Passthrough that needs completion in the host + * Exit guest and return to host by branching to guest_exit_cont + * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD + * to indicate to the host to complete handling the interrupt + * + * Before returning to guest, we check if any CPU is heading out + * to the host and if so, we head out also. If no CPUs are heading + * check return values <= 0. + * + * Return to guest (r3 <= 0) + * 0 No external interrupt is pending + * -1 A guest wakeup IPI (which has now been cleared) + * In either case, we return to guest to deliver any pending + * guest interrupts. + * + * -2 A PCI passthrough external interrupt was handled + * (interrupt was delivered directly to guest) + * Return to guest to deliver any pending guest interrupts. + */ + + cmpdi r3, 1 + ble 1f + + /* Return code = 2 */ + li r12, BOOK3S_INTERRUPT_HV_RM_HARD + stw r12, VCPU_TRAP(r9) + b guest_exit_cont + +1: /* Return code <= 1 */ cmpdi r3, 0 bgt guest_exit_cont - /* Check if any CPU is heading out to the host, if so head out too */ + /* Return code <= 0 */ 4: ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -1271,10 +1334,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) stw r6, VCPU_PSPB(r9) std r7, VCPU_FSCR(r9) mfspr r5, SPRN_IC - mfspr r6, SPRN_VTB mfspr r7, SPRN_TAR std r5, VCPU_IC(r9) - std r6, VCPU_VTB(r9) std r7, VCPU_TAR(r9) mfspr r8, SPRN_EBBHR std r8, VCPU_EBBHR(r9) @@ -1501,9 +1562,11 @@ kvmhv_switch_to_host: isync BEGIN_FTR_SECTION - /* DPDES is shared between threads */ + /* DPDES and VTB are shared between threads */ mfspr r7, SPRN_DPDES + mfspr r8, SPRN_VTB std r7, VCORE_DPDES(r5) + std r8, VCORE_VTB(r5) /* clear DPDES so we don't get guest doorbells in the host */ li r8, 0 mtspr SPRN_DPDES, r8 @@ -2213,10 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) ld r29, VCPU_GPR(R29)(r4) ld r30, VCPU_GPR(R30)(r4) ld r31, VCPU_GPR(R31)(r4) - + /* Check the wake reason in SRR1 to see why we got here */ bl kvmppc_check_wake_reason + /* + * Restore volatile registers since we could have called a + * C routine in kvmppc_check_wake_reason + * r4 = VCPU + * r3 tells us whether we need to return to host or not + * WARNING: it gets checked further down: + * should not modify r3 until this check is done. + */ + ld r4, HSTATE_KVM_VCPU(r13) + /* clear our bit in vcore->napping_threads */ 34: ld r5,HSTATE_KVM_VCORE(r13) lbz r7,HSTATE_PTID(r13) @@ -2230,7 +2303,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) li r0,0 stb r0,HSTATE_NAPPING(r13) - /* See if the wake reason means we need to exit */ + /* See if the wake reason saved in r3 means we need to exit */ stw r12, VCPU_TRAP(r4) mr r9, r4 cmpdi r3, 0 @@ -2297,10 +2370,14 @@ machine_check_realmode: * 0 if nothing needs to be done * 1 if something happened that needs to be handled by the host * -1 if there was a guest wakeup (IPI or msgsnd) + * -2 if we handled a PCI passthrough interrupt (returned by + * kvmppc_read_intr only) * * Also sets r12 to the interrupt vector for any interrupt that needs * to be handled now by the host (0x500 for external interrupt), or zero. - * Modifies r0, r6, r7, r8. + * Modifies all volatile registers (since it may call a C function). + * This routine calls kvmppc_read_intr, a C function, if an external + * interrupt is pending. */ kvmppc_check_wake_reason: mfspr r6, SPRN_SRR1 @@ -2310,8 +2387,7 @@ FTR_SECTION_ELSE rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) cmpwi r6, 8 /* was it an external interrupt? */ - li r12, BOOK3S_INTERRUPT_EXTERNAL - beq kvmppc_read_intr /* if so, see what it was */ + beq 7f /* if so, see what it was */ li r3, 0 li r12, 0 cmpwi r6, 6 /* was it the decrementer? */ @@ -2350,83 +2426,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 1 blr -/* - * Determine what sort of external interrupt is pending (if any). - * Returns: - * 0 if no interrupt is pending - * 1 if an interrupt is pending that needs to be handled by the host - * -1 if there was a guest wakeup IPI (which has now been cleared) - * Modifies r0, r6, r7, r8, returns value in r3. - */ -kvmppc_read_intr: - /* see if a host IPI is pending */ - li r3, 1 - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne 1f + /* external interrupt - create a stack frame so we can call C */ +7: mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + bl kvmppc_read_intr + nop + li r12, BOOK3S_INTERRUPT_EXTERNAL + cmpdi r3, 1 + ble 1f - /* Now read the interrupt from the ICP */ - ld r6, HSTATE_XICS_PHYS(r13) - li r7, XICS_XIRR - cmpdi r6, 0 - beq- 1f - lwzcix r0, r6, r7 /* - * Save XIRR for later. Since we get in in reverse endian on LE - * systems, save it byte reversed and fetch it back in host endian. - */ - li r3, HSTATE_SAVED_XIRR - STWX_BE r0, r3, r13 -#ifdef __LITTLE_ENDIAN__ - lwz r3, HSTATE_SAVED_XIRR(r13) -#else - mr r3, r0 -#endif - rlwinm. r3, r3, 0, 0xffffff - sync - beq 1f /* if nothing pending in the ICP */ - - /* We found something in the ICP... - * - * If it's not an IPI, stash it in the PACA and return to - * the host, we don't (yet) handle directing real external - * interrupts directly to the guest + * Return code of 2 means PCI passthrough interrupt, but + * we need to return back to host to complete handling the + * interrupt. Trap reason is expected in r12 by guest + * exit code. */ - cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ - bne 42f - - /* It's an IPI, clear the MFRR and EOI it */ - li r3, 0xff - li r8, XICS_MFRR - stbcix r3, r6, r8 /* clear the IPI */ - stwcix r0, r6, r7 /* EOI it */ - sync - - /* We need to re-check host IPI now in case it got set in the - * meantime. If it's clear, we bounce the interrupt to the - * guest - */ - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne- 43f - - /* OK, it's an IPI for us */ - li r12, 0 - li r3, -1 -1: blr - -42: /* It's not an IPI and it's for the host. We saved a copy of XIRR in - * the PACA earlier, it will be picked up by the host ICP driver - */ - li r3, 1 - b 1b - -43: /* We raced with the host, we need to resend that IPI, bummer */ - li r0, IPI_PRIORITY - stbcix r0, r6, r8 /* set the IPI */ - sync - li r3, 1 - b 1b + li r12, BOOK3S_INTERRUPT_HV_RM_HARD +1: + ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) + addi r1, r1, PPC_MIN_STKFRM + mtlr r0 + blr /* * Save away FP, VMX and VSX registers. diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index e76f79a..826c541 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -226,7 +226,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, */ vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb; vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb; - vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb; + to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb; if (cpu_has_feature(CPU_FTR_ARCH_207S)) vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic; svcpu->in_use = false; @@ -448,6 +448,8 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) case PVR_POWER7: case PVR_POWER7p: case PVR_POWER8: + case PVR_POWER8E: + case PVR_POWER8NVL: vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | BOOK3S_HFLAG_NEW_TLBIE; break; @@ -1361,6 +1363,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_HIOR: *val = get_reg_val(id, to_book3s(vcpu)->hior); break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, to_book3s(vcpu)->vtb); + break; case KVM_REG_PPC_LPCR: case KVM_REG_PPC_LPCR_64: /* @@ -1397,6 +1402,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, to_book3s(vcpu)->hior = set_reg_val(id, *val); to_book3s(vcpu)->hior_explicit = true; break; + case KVM_REG_PPC_VTB: + to_book3s(vcpu)->vtb = set_reg_val(id, *val); + break; case KVM_REG_PPC_LPCR: case KVM_REG_PPC_LPCR_64: kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 05aa113..3bdc639 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) return 0; } + /* Record which CPU this arrived on for passed-through interrupts */ + if (state->host_irq) + state->intr_cpu = raw_smp_processor_id(); + /* Attempt delivery */ icp_deliver_irq(xics, NULL, irq); @@ -812,7 +816,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return H_SUCCESS; } -static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; @@ -841,6 +845,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete); int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) { @@ -892,6 +897,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); /* -- Initialisation code etc. -- */ +static void xics_debugfs_irqmap(struct seq_file *m, + struct kvmppc_passthru_irqmap *pimap) +{ + int i; + + if (!pimap) + return; + seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n", + pimap->n_mapped); + for (i = 0; i < pimap->n_mapped; i++) { + seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n", + pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq); + } +} + static int xics_debug_show(struct seq_file *m, void *private) { struct kvmppc_xics *xics = m->private; @@ -913,6 +933,8 @@ static int xics_debug_show(struct seq_file *m, void *private) t_check_resend = 0; t_reject = 0; + xics_debugfs_irqmap(m, kvm->arch.pimap); + seq_printf(m, "=========\nICP state\n=========\n"); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -1252,6 +1274,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, { struct kvmppc_xics *xics = kvm->arch.xics; + if (!xics) + return -ENODEV; return ics_deliver_irq(xics, irq, level); } @@ -1418,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) { return pin; } + +void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq, + unsigned long host_irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + u16 idx; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return; + + ics->irq_state[idx].host_irq = host_irq; + ics->irq_state[idx].intr_cpu = -1; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped); + +void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq, + unsigned long host_irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + u16 idx; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return; + + ics->irq_state[idx].host_irq = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index a46b954..2a50320 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -42,6 +42,8 @@ struct ics_irq_state { u8 lsi; /* level-sensitive interrupt */ u8 asserted; /* Only for LSI */ u8 exists; + int intr_cpu; + u32 host_irq; }; /* Atomic ICP state, updated with a single compare & swap */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 02b4672..df3f270 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2038,7 +2038,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (type == KVMPPC_DEBUG_NONE) continue; - if (type & !(KVMPPC_DEBUG_WATCH_READ | + if (type & ~(KVMPPC_DEBUG_WATCH_READ | KVMPPC_DEBUG_WATCH_WRITE | KVMPPC_DEBUG_BREAKPOINT)) return -EINVAL; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 29911a0..ddbf8f0 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, char *virt; struct page **pages; struct tlbe_priv *privs[2] = {}; - u64 *g2h_bitmap = NULL; + u64 *g2h_bitmap; size_t array_len; u32 sets; int num_pages, ret, i; @@ -779,41 +779,44 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - cfg->array / PAGE_SIZE; - pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL); if (!pages) return -ENOMEM; ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); if (ret < 0) - goto err_pages; + goto free_pages; if (ret != num_pages) { num_pages = ret; ret = -EFAULT; - goto err_put_page; + goto put_pages; } virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); if (!virt) { ret = -ENOMEM; - goto err_put_page; + goto put_pages; } - privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], - GFP_KERNEL); - privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], - GFP_KERNEL); + privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL); + if (!privs[0]) { + ret = -ENOMEM; + goto put_pages; + } - if (!privs[0] || !privs[1]) { + privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL); + if (!privs[1]) { ret = -ENOMEM; - goto err_privs; + goto free_privs_first; } - g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], - GFP_KERNEL); + g2h_bitmap = kcalloc(params.tlb_sizes[1], + sizeof(*g2h_bitmap), + GFP_KERNEL); if (!g2h_bitmap) { ret = -ENOMEM; - goto err_privs; + goto free_privs_second; } free_gtlb(vcpu_e500); @@ -845,16 +848,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; - -err_privs: - kfree(privs[0]); + free_privs_second: kfree(privs[1]); - -err_put_page: + free_privs_first: + kfree(privs[0]); + put_pages: for (i = 0; i < num_pages; i++) put_page(pages[i]); - -err_pages: + free_pages: kfree(pages); return ret; } @@ -904,11 +905,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu, int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; - int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); - int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; if (e500_mmu_host_init(vcpu_e500)) - goto err; + goto free_vcpu; vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; @@ -920,37 +919,39 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; vcpu_e500->gtlb_params[1].sets = 1; - vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE + + KVM_E500_TLB1_SIZE, + sizeof(*vcpu_e500->gtlb_arch), + GFP_KERNEL); if (!vcpu_e500->gtlb_arch) return -ENOMEM; vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; - vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * - vcpu_e500->gtlb_params[0].entries, + vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries, + sizeof(struct tlbe_ref), GFP_KERNEL); if (!vcpu_e500->gtlb_priv[0]) - goto err; + goto free_vcpu; - vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * - vcpu_e500->gtlb_params[1].entries, + vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries, + sizeof(struct tlbe_ref), GFP_KERNEL); if (!vcpu_e500->gtlb_priv[1]) - goto err; + goto free_vcpu; - vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * - vcpu_e500->gtlb_params[1].entries, + vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries, + sizeof(*vcpu_e500->g2h_tlb1_map), GFP_KERNEL); if (!vcpu_e500->g2h_tlb1_map) - goto err; + goto free_vcpu; vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; - -err: + free_vcpu: free_gtlb(vcpu_e500); return -1; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6ce40dd..70963c8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -27,6 +27,8 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/module.h> +#include <linux/irqbypass.h> +#include <linux/kvm_irqfd.h> #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> @@ -436,6 +438,16 @@ err_out: return -EINVAL; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; @@ -739,6 +751,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } +/* + * irq_bypass_add_producer and irq_bypass_del_producer are only + * useful if the architecture supports PCI passthrough. + * irq_bypass_stop and irq_bypass_start are not needed and so + * kvm_ops are not defined for them. + */ +bool kvm_arch_has_irq_bypass(void) +{ + return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) || + (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer)); +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + + if (kvm->arch.kvm_ops->irq_bypass_add_producer) + return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod); + + return 0; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + + if (kvm->arch.kvm_ops->irq_bypass_del_producer) + kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod); +} + static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -1167,6 +1215,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +bool kvm_arch_intc_initialized(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_MPIC + if (kvm->arch.mpic) + return true; +#endif +#ifdef CONFIG_KVM_XICS + if (kvm->arch.xics) + return true; +#endif + return false; +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index 33d9daf..fb21990 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked, __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) ); +TRACE_EVENT(kvmppc_vcore_wakeup, + TP_PROTO(int do_sleep, __u64 ns), + + TP_ARGS(do_sleep, ns), + + TP_STRUCT__entry( + __field(__u64, ns) + __field(int, waited) + __field(pid_t, tgid) + ), + + TP_fast_assign( + __entry->ns = ns; + __entry->waited = do_sleep; + __entry->tgid = current->tgid; + ), + + TP_printk("%s time %lld ns, tgid=%d", + __entry->waited ? "wait" : "poll", + __entry->ns, __entry->tgid) +); + TRACE_EVENT(kvmppc_run_vcpu_enter, TP_PROTO(struct kvm_vcpu *vcpu), diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 0e4e965..83ddc0e 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -493,36 +493,6 @@ static void native_hugepage_invalidate(unsigned long vsid, } #endif -static inline int __hpte_actual_psize(unsigned int lp, int psize) -{ - int i, shift; - unsigned int mask; - - /* start from 1 ignoring MMU_PAGE_4K */ - for (i = 1; i < MMU_PAGE_COUNT; i++) { - - /* invalid penc */ - if (mmu_psize_defs[psize].penc[i] == -1) - continue; - /* - * encoding bits per actual page size - * PTE LP actual page size - * rrrr rrrz >=8KB - * rrrr rrzz >=16KB - * rrrr rzzz >=32KB - * rrrr zzzz >=64KB - * ....... - */ - shift = mmu_psize_defs[i].shift - LP_SHIFT; - if (shift > LP_BITS) - shift = LP_BITS; - mask = (1 << shift) - 1; - if ((lp & mask) == mmu_psize_defs[psize].penc[i]) - return i; - } - return -1; -} - static void hpte_decode(struct hash_pte *hpte, unsigned long slot, int *psize, int *apsize, int *ssize, unsigned long *vpn) { @@ -538,16 +508,8 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, size = MMU_PAGE_4K; a_size = MMU_PAGE_4K; } else { - for (size = 0; size < MMU_PAGE_COUNT; size++) { - - /* valid entries have a shift value */ - if (!mmu_psize_defs[size].shift) - continue; - - a_size = __hpte_actual_psize(lp, size); - if (a_size != -1) - break; - } + size = hpte_page_sizes[lp] & 0xf; + a_size = hpte_page_sizes[lp] >> 4; } /* This works for all page sizes, and for 256M and 1T segments */ if (cpu_has_feature(CPU_FTR_ARCH_300)) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0821556..ef3ae89 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -93,6 +93,9 @@ static unsigned long _SDR1; struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; EXPORT_SYMBOL_GPL(mmu_psize_defs); +u8 hpte_page_sizes[1 << LP_BITS]; +EXPORT_SYMBOL_GPL(hpte_page_sizes); + struct hash_pte *htab_address; unsigned long htab_size_bytes; unsigned long htab_hash_mask; @@ -564,8 +567,60 @@ static void __init htab_scan_page_sizes(void) #endif /* CONFIG_HUGETLB_PAGE */ } +/* + * Fill in the hpte_page_sizes[] array. + * We go through the mmu_psize_defs[] array looking for all the + * supported base/actual page size combinations. Each combination + * has a unique pagesize encoding (penc) value in the low bits of + * the LP field of the HPTE. For actual page sizes less than 1MB, + * some of the upper LP bits are used for RPN bits, meaning that + * we need to fill in several entries in hpte_page_sizes[]. + * + * In diagrammatic form, with r = RPN bits and z = page size bits: + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ... + * + * The zzzz bits are implementation-specific but are chosen so that + * no encoding for a larger page size uses the same value in its + * low-order N bits as the encoding for the 2^(12+N) byte page size + * (if it exists). + */ +static void init_hpte_page_sizes(void) +{ + long int ap, bp; + long int shift, penc; + + for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) { + if (!mmu_psize_defs[bp].shift) + continue; /* not a supported page size */ + for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) { + penc = mmu_psize_defs[bp].penc[ap]; + if (penc == -1) + continue; + shift = mmu_psize_defs[ap].shift - LP_SHIFT; + if (shift <= 0) + continue; /* should never happen */ + /* + * For page sizes less than 1MB, this loop + * replicates the entry for all possible values + * of the rrrr bits. + */ + while (penc < (1 << LP_BITS)) { + hpte_page_sizes[penc] = (ap << 4) | bp; + penc += 1 << shift; + } + } + } +} + static void __init htab_init_page_sizes(void) { + init_hpte_page_sizes(); + if (!debug_pagealloc_enabled()) { /* * Pick a size for the linear mapping. Currently, we only diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 3d29d40..44d2d84 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -208,6 +208,7 @@ OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL_REAL(opal_rm_set_xive, OPAL_SET_XIVE); OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 38a5c65..d314ecc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2718,15 +2718,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } #ifdef CONFIG_PCI_MSI -static void pnv_ioda2_msi_eoi(struct irq_data *d) +int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); struct pnv_phb *phb = container_of(chip, struct pnv_phb, ioda.irq_chip); + + return opal_pci_msi_eoi(phb->opal_id, hw_irq); +} + +static void pnv_ioda2_msi_eoi(struct irq_data *d) +{ int64_t rc; + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + struct irq_chip *chip = irq_data_get_irq_chip(d); - rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); + rc = pnv_opal_pci_msi_eoi(chip, hw_irq); WARN_ON_ONCE(rc); icp_native_eoi(d); @@ -2756,6 +2762,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) irq_set_chip(virq, &phb->ioda.irq_chip); } +/* + * Returns true iff chip is something that we could call + * pnv_opal_pci_msi_eoi for. + */ +bool is_pnv_opal_msi(struct irq_chip *chip) +{ + return chip->irq_eoi == pnv_ioda2_msi_eoi; +} +EXPORT_SYMBOL_GPL(is_pnv_opal_msi); + static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8e5daf7..a41faf3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -28,7 +28,7 @@ #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 -#define KVM_MAX_VCPUS KVM_S390_ESCA_CPU_SLOTS +#define KVM_MAX_VCPUS 255 #define KVM_USER_MEM_SLOTS 32 /* @@ -245,72 +245,72 @@ struct sie_page { } __packed; struct kvm_vcpu_stat { - u32 exit_userspace; - u32 exit_null; - u32 exit_external_request; - u32 exit_external_interrupt; - u32 exit_stop_request; - u32 exit_validity; - u32 exit_instruction; - u32 exit_pei; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 instruction_lctl; - u32 instruction_lctlg; - u32 instruction_stctl; - u32 instruction_stctg; - u32 exit_program_interruption; - u32 exit_instr_and_program; - u32 exit_operation_exception; - u32 deliver_external_call; - u32 deliver_emergency_signal; - u32 deliver_service_signal; - u32 deliver_virtio_interrupt; - u32 deliver_stop_signal; - u32 deliver_prefix_signal; - u32 deliver_restart_signal; - u32 deliver_program_int; - u32 deliver_io_int; - u32 exit_wait_state; - u32 instruction_pfmf; - u32 instruction_stidp; - u32 instruction_spx; - u32 instruction_stpx; - u32 instruction_stap; - u32 instruction_storage_key; - u32 instruction_ipte_interlock; - u32 instruction_stsch; - u32 instruction_chsc; - u32 instruction_stsi; - u32 instruction_stfl; - u32 instruction_tprot; - u32 instruction_sie; - u32 instruction_essa; - u32 instruction_sthyi; - u32 instruction_sigp_sense; - u32 instruction_sigp_sense_running; - u32 instruction_sigp_external_call; - u32 instruction_sigp_emergency; - u32 instruction_sigp_cond_emergency; - u32 instruction_sigp_start; - u32 instruction_sigp_stop; - u32 instruction_sigp_stop_store_status; - u32 instruction_sigp_store_status; - u32 instruction_sigp_store_adtl_status; - u32 instruction_sigp_arch; - u32 instruction_sigp_prefix; - u32 instruction_sigp_restart; - u32 instruction_sigp_init_cpu_reset; - u32 instruction_sigp_cpu_reset; - u32 instruction_sigp_unknown; - u32 diagnose_10; - u32 diagnose_44; - u32 diagnose_9c; - u32 diagnose_258; - u32 diagnose_308; - u32 diagnose_500; + u64 exit_userspace; + u64 exit_null; + u64 exit_external_request; + u64 exit_external_interrupt; + u64 exit_stop_request; + u64 exit_validity; + u64 exit_instruction; + u64 exit_pei; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 instruction_lctl; + u64 instruction_lctlg; + u64 instruction_stctl; + u64 instruction_stctg; + u64 exit_program_interruption; + u64 exit_instr_and_program; + u64 exit_operation_exception; + u64 deliver_external_call; + u64 deliver_emergency_signal; + u64 deliver_service_signal; + u64 deliver_virtio_interrupt; + u64 deliver_stop_signal; + u64 deliver_prefix_signal; + u64 deliver_restart_signal; + u64 deliver_program_int; + u64 deliver_io_int; + u64 exit_wait_state; + u64 instruction_pfmf; + u64 instruction_stidp; + u64 instruction_spx; + u64 instruction_stpx; + u64 instruction_stap; + u64 instruction_storage_key; + u64 instruction_ipte_interlock; + u64 instruction_stsch; + u64 instruction_chsc; + u64 instruction_stsi; + u64 instruction_stfl; + u64 instruction_tprot; + u64 instruction_sie; + u64 instruction_essa; + u64 instruction_sthyi; + u64 instruction_sigp_sense; + u64 instruction_sigp_sense_running; + u64 instruction_sigp_external_call; + u64 instruction_sigp_emergency; + u64 instruction_sigp_cond_emergency; + u64 instruction_sigp_start; + u64 instruction_sigp_stop; + u64 instruction_sigp_stop_store_status; + u64 instruction_sigp_store_status; + u64 instruction_sigp_store_adtl_status; + u64 instruction_sigp_arch; + u64 instruction_sigp_prefix; + u64 instruction_sigp_restart; + u64 instruction_sigp_init_cpu_reset; + u64 instruction_sigp_cpu_reset; + u64 instruction_sigp_unknown; + u64 diagnose_10; + u64 diagnose_44; + u64 diagnose_9c; + u64 diagnose_258; + u64 diagnose_308; + u64 diagnose_500; }; #define PGM_OPERATION 0x01 @@ -577,7 +577,7 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_arch_memory_slot { diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 1f95cc1..f3df9e0 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -125,6 +125,7 @@ int main(void) OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list); OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list); OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); + OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 5420020..4aa8a7e 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -495,6 +495,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; switch (code) { + case PGM_PROTECTION: + switch (prot) { + case PROT_TYPE_ALC: + tec->b60 = 1; + /* FALL THROUGH */ + case PROT_TYPE_DAT: + tec->b61 = 1; + break; + default: /* LA and KEYC set b61 to 0, other params undefined */ + return code; + } + /* FALL THROUGH */ case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -504,8 +516,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, /* * op_access_id only applies to MOVE_PAGE -> set bit 61 * exc_access_id has to be set to 0 for some instructions. Both - * cases have to be handled by the caller. We can always store - * exc_access_id, as it is undefined for non-ar cases. + * cases have to be handled by the caller. */ tec->addr = gva >> PAGE_SHIFT; tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; @@ -516,25 +527,13 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, case PGM_ASTE_VALIDITY: case PGM_ASTE_SEQUENCE: case PGM_EXTENDED_AUTHORITY: + /* + * We can always store exc_access_id, as it is + * undefined for non-ar cases. It is undefined for + * most DAT protection exceptions. + */ pgm->exc_access_id = ar; break; - case PGM_PROTECTION: - switch (prot) { - case PROT_TYPE_ALC: - tec->b60 = 1; - /* FALL THROUGH */ - case PROT_TYPE_DAT: - tec->b61 = 1; - tec->addr = gva >> PAGE_SHIFT; - tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* exc_access_id is undefined for most cases */ - pgm->exc_access_id = ar; - break; - default: /* LA and KEYC set b61 to 0, other params undefined */ - break; - } - break; } return code; } diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 31a0533..d7c6a7f 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -206,7 +206,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu, int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0, nr_wp = 0, nr_bp = 0, i, size; + int ret = 0, nr_wp = 0, nr_bp = 0, i; struct kvm_hw_breakpoint *bp_data = NULL; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; @@ -216,17 +216,10 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - size = dbg->arch.nr_hw_bp * sizeof(struct kvm_hw_breakpoint); - bp_data = kmalloc(size, GFP_KERNEL); - if (!bp_data) { - ret = -ENOMEM; - goto error; - } - - if (copy_from_user(bp_data, dbg->arch.hw_bp, size)) { - ret = -EFAULT; - goto error; - } + bp_data = memdup_user(dbg->arch.hw_bp, + sizeof(*bp_data) * dbg->arch.nr_hw_bp); + if (IS_ERR(bp_data)) + return PTR_ERR(bp_data); for (i = 0; i < dbg->arch.nr_hw_bp; i++) { switch (bp_data[i].type) { @@ -241,17 +234,19 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, } } - size = nr_wp * sizeof(struct kvm_hw_wp_info_arch); - if (size > 0) { - wp_info = kmalloc(size, GFP_KERNEL); + if (nr_wp > 0) { + wp_info = kmalloc_array(nr_wp, + sizeof(*wp_info), + GFP_KERNEL); if (!wp_info) { ret = -ENOMEM; goto error; } } - size = nr_bp * sizeof(struct kvm_hw_bp_info_arch); - if (size > 0) { - bp_info = kmalloc(size, GFP_KERNEL); + if (nr_bp > 0) { + bp_info = kmalloc_array(nr_bp, + sizeof(*bp_info), + GFP_KERNEL); if (!bp_info) { ret = -ENOMEM; goto error; @@ -382,14 +377,20 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; } +#define PER_CODE_MASK (PER_EVENT_MASK >> 24) +#define PER_CODE_BRANCH (PER_EVENT_BRANCH >> 24) +#define PER_CODE_IFETCH (PER_EVENT_IFETCH >> 24) +#define PER_CODE_STORE (PER_EVENT_STORE >> 24) +#define PER_CODE_STORE_REAL (PER_EVENT_STORE_REAL >> 24) + #define per_bp_event(code) \ - (code & (PER_EVENT_IFETCH | PER_EVENT_BRANCH)) + (code & (PER_CODE_IFETCH | PER_CODE_BRANCH)) #define per_write_wp_event(code) \ - (code & (PER_EVENT_STORE | PER_EVENT_STORE_REAL)) + (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) static int debug_exit_required(struct kvm_vcpu *vcpu) { - u32 perc = (vcpu->arch.sie_block->perc << 24); + u8 perc = vcpu->arch.sie_block->perc; struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; @@ -444,7 +445,7 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) const u8 ilen = kvm_s390_get_ilen(vcpu); struct kvm_s390_pgm_info pgm_info = { .code = PGM_PER, - .per_code = PER_EVENT_IFETCH >> 24, + .per_code = PER_CODE_IFETCH, .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), }; @@ -458,33 +459,33 @@ int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) static void filter_guest_per_event(struct kvm_vcpu *vcpu) { - u32 perc = vcpu->arch.sie_block->perc << 24; + const u8 perc = vcpu->arch.sie_block->perc; u64 peraddr = vcpu->arch.sie_block->peraddr; u64 addr = vcpu->arch.sie_block->gpsw.addr; u64 cr9 = vcpu->arch.sie_block->gcr[9]; u64 cr10 = vcpu->arch.sie_block->gcr[10]; u64 cr11 = vcpu->arch.sie_block->gcr[11]; /* filter all events, demanded by the guest */ - u32 guest_perc = perc & cr9 & PER_EVENT_MASK; + u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; if (!guest_per_enabled(vcpu)) guest_perc = 0; /* filter "successful-branching" events */ - if (guest_perc & PER_EVENT_BRANCH && + if (guest_perc & PER_CODE_BRANCH && cr9 & PER_CONTROL_BRANCH_ADDRESS && !in_addr_range(addr, cr10, cr11)) - guest_perc &= ~PER_EVENT_BRANCH; + guest_perc &= ~PER_CODE_BRANCH; /* filter "instruction-fetching" events */ - if (guest_perc & PER_EVENT_IFETCH && + if (guest_perc & PER_CODE_IFETCH && !in_addr_range(peraddr, cr10, cr11)) - guest_perc &= ~PER_EVENT_IFETCH; + guest_perc &= ~PER_CODE_IFETCH; /* All other PER events will be given to the guest */ /* TODO: Check altered address/address space */ - vcpu->arch.sie_block->perc = guest_perc >> 24; + vcpu->arch.sie_block->perc = guest_perc; if (!guest_perc) vcpu->arch.sie_block->iprcc &= ~PGM_PER; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index dfd0ca2..1cab8a1 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -29,6 +29,7 @@ static const intercept_handler_t instruction_handlers[256] = { [0x01] = kvm_s390_handle_01, [0x82] = kvm_s390_handle_lpsw, [0x83] = kvm_s390_handle_diag, + [0xaa] = kvm_s390_handle_aa, [0xae] = kvm_s390_handle_sigp, [0xb2] = kvm_s390_handle_b2, [0xb6] = kvm_s390_handle_stctl, diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 24524c0..be4db07 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -24,6 +24,8 @@ #include <asm/sclp.h> #include <asm/isc.h> #include <asm/gmap.h> +#include <asm/switch_to.h> +#include <asm/nmi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -40,6 +42,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) return 0; + BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -68,6 +71,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) { int expect, rc; + BUG_ON(!kvm_s390_use_sca_entries()); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -109,6 +113,8 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc, expect; + if (!kvm_s390_use_sca_entries()) + return; atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags); read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { @@ -400,12 +406,78 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +static int __write_machine_check(struct kvm_vcpu *vcpu, + struct kvm_s390_mchk_info *mchk) +{ + unsigned long ext_sa_addr; + freg_t fprs[NUM_FPRS]; + union mci mci; + int rc; + + mci.val = mchk->mcic; + /* take care of lazy register loading via vcpu load/put */ + save_fpu_regs(); + save_access_regs(vcpu->run->s.regs.acrs); + + /* Extended save area */ + rc = read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, &ext_sa_addr, + sizeof(unsigned long)); + /* Only bits 0-53 are used for address formation */ + ext_sa_addr &= ~0x3ffUL; + if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { + if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs, + 512)) + mci.vr = 0; + } else { + mci.vr = 0; + } + + /* General interruption information */ + rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); + + /* Register-save areas */ + if (MACHINE_HAS_VX) { + convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); + } else { + rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, + vcpu->run->s.regs.fprs, 128); + } + rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA, + vcpu->run->s.regs.gprs, 128); + rc |= put_guest_lc(vcpu, current->thread.fpu.fpc, + (u32 __user *) __LC_FP_CREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr, + (u32 __user *) __LC_TOD_PROGREG_SAVE_AREA); + rc |= put_guest_lc(vcpu, kvm_s390_get_cpu_timer(vcpu), + (u64 __user *) __LC_CPU_TIMER_SAVE_AREA); + rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->ckc >> 8, + (u64 __user *) __LC_CLOCK_COMP_SAVE_AREA); + rc |= write_guest_lc(vcpu, __LC_AREGS_SAVE_AREA, + &vcpu->run->s.regs.acrs, 64); + rc |= write_guest_lc(vcpu, __LC_CREGS_SAVE_AREA, + &vcpu->arch.sie_block->gcr, 128); + + /* Extended interruption information */ + rc |= put_guest_lc(vcpu, mchk->ext_damage_code, + (u32 __user *) __LC_EXT_DAMAGE_CODE); + rc |= put_guest_lc(vcpu, mchk->failing_storage_address, + (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); + rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, &mchk->fixed_logout, + sizeof(mchk->fixed_logout)); + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_mchk_info mchk = {}; - unsigned long adtl_status_addr; int deliver = 0; int rc = 0; @@ -446,29 +518,9 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK, mchk.cr14, mchk.mcic); - - rc = kvm_s390_vcpu_store_status(vcpu, - KVM_S390_STORE_STATUS_PREFIXED); - rc |= read_guest_lc(vcpu, __LC_VX_SAVE_AREA_ADDR, - &adtl_status_addr, - sizeof(unsigned long)); - rc |= kvm_s390_vcpu_store_adtl_status(vcpu, - adtl_status_addr); - rc |= put_guest_lc(vcpu, mchk.mcic, - (u64 __user *) __LC_MCCK_CODE); - rc |= put_guest_lc(vcpu, mchk.failing_storage_address, - (u64 __user *) __LC_MCCK_FAIL_STOR_ADDR); - rc |= write_guest_lc(vcpu, __LC_PSW_SAVE_AREA, - &mchk.fixed_logout, - sizeof(mchk.fixed_logout)); - rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW, - &vcpu->arch.sie_block->gpsw, - sizeof(psw_t)); + rc = __write_machine_check(vcpu, &mchk); } - return rc ? -EFAULT : 0; + return rc; } static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7e8cb6a..9c7a1ec 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -384,7 +384,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: r = KVM_S390_BSCA_CPU_SLOTS; - if (sclp.has_esca && sclp.has_64bscao) + if (!kvm_s390_use_sca_entries()) + r = KVM_MAX_VCPUS; + else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; break; case KVM_CAP_NR_MEMSLOTS: @@ -1498,6 +1500,16 @@ out_err: return rc; } +bool kvm_arch_has_vcpu_debugfs(void) +{ + return false; +} + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 3, "%s", "free cpu"); @@ -1561,6 +1573,8 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) static void sca_del_vcpu(struct kvm_vcpu *vcpu) { + if (!kvm_s390_use_sca_entries()) + return; read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -1578,6 +1592,13 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { + if (!kvm_s390_use_sca_entries()) { + struct bsca_block *sca = vcpu->kvm->arch.sca; + + /* we still need the basic sca for the ipte control */ + vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); + vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -1658,6 +1679,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) { int rc; + if (!kvm_s390_use_sca_entries()) { + if (id < KVM_MAX_VCPUS) + return true; + return false; + } if (id < KVM_S390_BSCA_CPU_SLOTS) return true; if (!sclp.has_esca || !sclp.has_64bscao) @@ -1946,8 +1972,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->eca |= 1; if (sclp.has_sigpif) vcpu->arch.sie_block->eca |= 0x10000000U; - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->arch.sie_block->ecb3 |= 0x01; if (test_kvm_facility(vcpu->kvm, 129)) { vcpu->arch.sie_block->eca |= 0x00020000; vcpu->arch.sie_block->ecd |= 0x20000000; @@ -2704,6 +2728,19 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + /* + * If userspace sets the riccb (e.g. after migration) to a valid state, + * we should enable RI here instead of doing the lazy enablement. + */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && + test_kvm_facility(vcpu->kvm, 64)) { + struct runtime_instr_cb *riccb = + (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; + + if (riccb->valid) + vcpu->arch.sie_block->ecb3 |= 0x01; + } + kvm_run->kvm_dirty_regs = 0; } @@ -2847,38 +2884,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) return kvm_s390_store_status_unloaded(vcpu, addr); } -/* - * store additional status at address - */ -int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, - unsigned long gpa) -{ - /* Only bits 0-53 are used for address formation */ - if (!(gpa & ~0x3ff)) - return 0; - - return write_guest_abs(vcpu, gpa & ~0x3ff, - (void *)&vcpu->run->s.regs.vrs, 512); -} - -int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr) -{ - if (!test_kvm_facility(vcpu->kvm, 129)) - return 0; - - /* - * The guest VXRS are in the host VXRs due to the lazy - * copying in vcpu load/put. We can simply call save_fpu_regs() - * to save the current register state because we are in the - * middle of a load/put cycle. - * - * Let's update our copies before we save it into the save area. - */ - save_fpu_regs(); - - return kvm_s390_store_adtl_status_unloaded(vcpu, addr); -} - static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index b843286..3a4e97f 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/facility.h> #include <asm/processor.h> +#include <asm/sclp.h> typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); @@ -245,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) /* implemented in priv.c */ int is_valid_psw(psw_t *psw); +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); @@ -273,10 +275,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu); void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); -int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu, - unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); @@ -389,4 +388,13 @@ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) return &sca->ipte_control; } +static inline int kvm_s390_use_sca_entries(void) +{ + /* + * Without SIGP interpretation, only SRS interpretation (if available) + * might use the entries. By not setting the entries and keeping them + * invalid, hardware will not access them but intercept. + */ + return sclp.has_sigpif; +} #endif diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 4616038..e184353 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -32,6 +32,24 @@ #include "kvm-s390.h" #include "trace.h" +static int handle_ri(struct kvm_vcpu *vcpu) +{ + if (test_kvm_facility(vcpu->kvm, 64)) { + vcpu->arch.sie_block->ecb3 |= 0x01; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.sie_block->ipa & 0xf) <= 4) + return handle_ri(vcpu); + else + return -EOPNOTSUPP; +} + /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { @@ -1093,6 +1111,9 @@ static int handle_stctg(struct kvm_vcpu *vcpu) static const intercept_handler_t eb_handlers[256] = { [0x2f] = handle_lctlg, [0x25] = handle_stctg, + [0x60] = handle_ri, + [0x61] = handle_ri, + [0x62] = handle_ri, }; int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config deleted file mode 100644 index 9906505..0000000 --- a/arch/x86/configs/kvm_guest.config +++ /dev/null @@ -1,31 +0,0 @@ -CONFIG_NET=y -CONFIG_NET_CORE=y -CONFIG_NETDEVICES=y -CONFIG_BLOCK=y -CONFIG_BLK_DEV=y -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_INET=y -CONFIG_TTY=y -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_BINFMT_ELF=y -CONFIG_PCI=y -CONFIG_PCI_MSI=y -CONFIG_DEBUG_KERNEL=y -CONFIG_VIRTUALIZATION=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y -CONFIG_VIRTIO=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_BLK=y -CONFIG_VIRTIO_CONSOLE=y -CONFIG_VIRTIO_NET=y -CONFIG_9P_FS=y -CONFIG_NET_9P=y -CONFIG_NET_9P_VIRTIO=y -CONFIG_SCSI_LOWLEVEL=y -CONFIG_SCSI_VIRTIO=y -CONFIG_VIRTIO_INPUT=y diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 94d54d0..02223cb 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode) return 0; } - ret = __pvclock_read_cycles(pvti); + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); /* refer to vread_tsc() comment for rationale */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33ae3a4..4b20f73 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -568,6 +568,7 @@ struct kvm_vcpu_arch { struct kvm_steal_time steal; } st; + u64 tsc_offset; u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; @@ -701,6 +702,8 @@ struct kvm_hv { /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; + + HV_REFERENCE_TSC_PAGE tsc_ref; }; struct kvm_arch { @@ -781,54 +784,56 @@ struct kvm_arch { bool disabled_lapic_found; /* Struct members for AVIC */ + u32 avic_vm_id; u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + struct hlist_node hnode; bool x2apic_format; bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 mmu_unsync; - u32 remote_tlb_flush; - u32 lpages; + ulong mmu_shadow_zapped; + ulong mmu_pte_write; + ulong mmu_pte_updated; + ulong mmu_pde_zapped; + ulong mmu_flooded; + ulong mmu_recycled; + ulong mmu_cache_miss; + ulong mmu_unsync; + ulong remote_tlb_flush; + ulong lpages; }; struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 nmi_window_exits; - u32 halt_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; - u32 hypercalls; - u32 irq_injections; - u32 nmi_injections; + u64 pf_fixed; + u64 pf_guest; + u64 tlb_flush; + u64 invlpg; + + u64 exits; + u64 io_exits; + u64 mmio_exits; + u64 signal_exits; + u64 irq_window_exits; + u64 nmi_window_exits; + u64 halt_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 request_irq_exits; + u64 irq_exits; + u64 host_state_reload; + u64 efer_reload; + u64 fpu_reload; + u64 insn_emulation; + u64 insn_emulation_fail; + u64 hypercalls; + u64 irq_injections; + u64 nmi_injections; }; struct x86_instruction_info; @@ -951,7 +956,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d019f0c..3ad741b 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, + u64 tsc) { - u64 delta = rdtsc_ordered() - src->tsc_timestamp; + u64 delta = tsc - src->tsc_timestamp; cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); return src->system_time + offset; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 3599404..5b2cc88 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = pvclock_read_begin(src); - ret = __pvclock_read_cycles(src); + ret = __pvclock_read_cycles(src, rdtsc_ordered()); flags = src->flags; } while (pvclock_read_retry(src, version)); diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 464fa47..3bff207 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o + hyperv.o page_track.o debugfs.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3235e0f..afa7bbb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c new file mode 100644 index 0000000..c19c7ed --- /dev/null +++ b/arch/x86/kvm/debugfs.c @@ -0,0 +1,69 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * Copyright 2016 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/kvm_host.h> +#include <linux/debugfs.h> + +bool kvm_arch_has_vcpu_debugfs(void) +{ + return true; +} + +static int vcpu_get_tsc_offset(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.tsc_offset; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n"); + +static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.tsc_scaling_ratio; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n"); + +static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) +{ + *val = kvm_tsc_scaling_ratio_frac_bits; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + struct dentry *ret; + + ret = debugfs_create_file("tsc-offset", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_offset_fops); + if (!ret) + return -ENOMEM; + + if (kvm_has_tsc_control) { + ret = debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_scaling_fops); + if (!ret) + return -ENOMEM; + ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_scaling_frac_fops); + if (!ret) + return -ENOMEM; + + } + + return 0; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 01bd7b7..42b1c83 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_vcpu *vcpu; + u64 tsc; + + /* + * The guest has not set up the TSC page or the clock isn't + * stable, fall back to get_kvmclock_ns. + */ + if (!hv->tsc_ref.tsc_sequence) + return div_u64(get_kvmclock_ns(kvm), 100); + + vcpu = kvm_get_vcpu(kvm, 0); + tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, @@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, return 0; } +/* + * The kvmclock and Hyper-V TSC page use similar formulas, and converting + * between them is possible: + * + * kvmclock formula: + * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) + * + system_time + * + * Hyper-V formula: + * nsec/100 = ticks * scale / 2^64 + offset + * + * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. + * By dividing the kvmclock formula by 100 and equating what's left we get: + * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 + * + * Now expand the kvmclock formula and divide by 100: + * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) + * + system_time + * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * + system_time / 100 + * + * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: + * nsec/100 = ticks * scale / 2^64 + * - tsc_timestamp * scale / 2^64 + * + system_time / 100 + * + * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: + * offset = system_time / 100 - tsc_timestamp * scale / 2^64 + * + * These two equivalencies are implemented in this function. + */ +static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, + HV_REFERENCE_TSC_PAGE *tsc_ref) +{ + u64 max_mul; + + if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) + return false; + + /* + * check if scale would overflow, if so we use the time ref counter + * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 + * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) + * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) + */ + max_mul = 100ull << (32 - hv_clock->tsc_shift); + if (hv_clock->tsc_to_system_mul >= max_mul) + return false; + + /* + * Otherwise compute the scale and offset according to the formulas + * derived above. + */ + tsc_ref->tsc_scale = + mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), + hv_clock->tsc_to_system_mul, + 100); + + tsc_ref->tsc_offset = hv_clock->system_time; + do_div(tsc_ref->tsc_offset, 100); + tsc_ref->tsc_offset -= + mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); + return true; +} + +void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock) +{ + struct kvm_hv *hv = &kvm->arch.hyperv; + u32 tsc_seq; + u64 gfn; + + BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); + BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + return; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + /* + * Because the TSC parameters only vary when there is a + * change in the master clock, do not bother with caching. + */ + if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), + &tsc_seq, sizeof(tsc_seq)))) + return; + + /* + * While we're computing and writing the parameters, force the + * guest to use the time reference count MSR. + */ + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + return; + + if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) + return; + + /* Ensure sequence is zero before writing the rest of the struct. */ + smp_wmb(); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + return; + + /* + * Now switch to the TSC page mechanism by writing the sequence. + */ + tsc_seq++; + if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) + tsc_seq = 1; + + /* Write the struct entirely before the non-zero sequence. */ + smp_wmb(); + + hv->tsc_ref.tsc_sequence = tsc_seq; + kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { @@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, mark_page_dirty(kvm, gfn); break; } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - - memset(&tsc_ref, 0, sizeof(tsc_ref)); + case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest( - kvm, - gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); break; - } case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(vcpu, msr - HV_X64_MSR_CRASH_P0, diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 60eccd4..cd11195 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); +void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock); + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b62c852..23b99f3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - } else + } else { static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + recalculate_apic_map(vcpu->kvm); + } } if ((old_value ^ value) & X2APIC_ENABLE) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d4cc8cc..d9c7e98 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * * Return true if tlb need be flushed. */ -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) +static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm, bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_write_protect(kvm, sptep, pt_protect); + flush |= spte_write_protect(sptep, pt_protect); return flush; } -static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(kvm, sptep); + flush |= spte_clear_dirty(sptep); return flush; } -static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_set_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(kvm, sptep); + flush |= spte_set_dirty(sptep); return flush; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1e6b84b..f8157a3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,8 @@ #include <linux/sched.h> #include <linux/trace_events.h> #include <linux/slab.h> +#include <linux/amd-iommu.h> +#include <linux/hashtable.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -41,6 +43,7 @@ #include <asm/desc.h> #include <asm/debugreg.h> #include <asm/kvm_para.h> +#include <asm/irq_remapping.h> #include <asm/virtext.h> #include "trace.h" @@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -185,6 +201,23 @@ struct vcpu_svm { struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; + + /* + * Per-vcpu list of struct amd_svm_iommu_ir: + * This is used mainly to store interrupt remapping information used + * when update the vcpu affinity. This avoids the need to scan for + * IRTE and try to match ga_tag in the IOMMU driver. + */ + struct list_head ir_list; + spinlock_t ir_list_lock; +}; + +/* + * This is a wrapper of struct amd_iommu_ir_data. + */ +struct amd_svm_iommu_ir { + struct list_head node; /* Used by SVM for per-vcpu ir_list */ + void *data; /* Storing pointer to struct amd_ir_data */ }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) @@ -242,6 +275,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* AVIC VM ID bit masks and lock */ +static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); +static DEFINE_SPINLOCK(avic_vm_id_lock); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +/* Note: + * This hash table is used to map VM_ID to a struct kvm_arch, + * when handling AMD IOMMU GALOG notification to schedule in + * a particular vCPU. + */ +#define SVM_VM_DATA_HASH_BITS 8 +DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static spinlock_t svm_vm_data_hash_lock; + +/* Note: + * This function is called from IOMMU driver to notify + * SVM to schedule in a particular vCPU of a particular VM. + */ +static int avic_ga_log_notifier(u32 ga_tag) +{ + unsigned long flags; + struct kvm_arch *ka = NULL; + struct kvm_vcpu *vcpu = NULL; + u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + + pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_vm_id != vm_id) + continue; + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + break; + } + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + + if (!vcpu) + return 0; + + /* Note: + * At this point, the IOMMU should have already set the pending + * bit in the vAPIC backing page. So, we just need to schedule + * in the vcpu. + */ + if (vcpu->mode == OUTSIDE_GUEST_MODE) + kvm_vcpu_wake_up(vcpu); + + return 0; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void) if (avic) { if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) + !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { avic = false; - else + } else { pr_info("AVIC enabled\n"); + + hash_init(svm_vm_data_hash); + spin_lock_init(&svm_vm_data_hash_lock); + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } } return 0; @@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return svm->vmcb->control.tsc_offset; -} - static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static inline int avic_get_next_vm_id(void) +{ + int id; + + spin_lock(&avic_vm_id_lock); + + /* AVIC VM ID is one-based. */ + id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); + if (id <= AVIC_VM_ID_MASK) + __set_bit(id, avic_vm_id_bitmap); + else + id = -EAGAIN; + + spin_unlock(&avic_vm_id_lock); + return id; +} + +static inline int avic_free_vm_id(int id) +{ + if (id <= 0 || id > AVIC_VM_ID_MASK) + return -EINVAL; + + spin_lock(&avic_vm_id_lock); + __clear_bit(id, avic_vm_id_bitmap); + spin_unlock(&avic_vm_id_lock); + return 0; +} + static void avic_vm_destroy(struct kvm *kvm) { + unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + avic_free_vm_id(vm_data->avic_vm_id); + if (vm_data->avic_logical_id_table_page) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) __free_page(vm_data->avic_physical_id_table_page); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_del(&vm_data->hnode); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } static int avic_vm_init(struct kvm *kvm) { - int err = -ENOMEM; + unsigned long flags; + int vm_id, err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; struct page *l_page; @@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm) if (!avic) return 0; + vm_id = avic_get_next_vm_id(); + if (vm_id < 0) + return vm_id; + vm_data->avic_vm_id = (u32)vm_id; + /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); if (!p_page) @@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm) vm_data->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + return 0; free_avic: @@ -1323,31 +1452,34 @@ free_avic: return err; } -/** - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +static inline int +avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { - u64 entry; - int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - svm->avic_is_running = is_run; + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) - return; + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)); + if (list_empty(&svm->ir_list)) + goto out; - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (is_run) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + list_for_each_entry(ir, &svm->ir_list, node) { + ret = amd_iommu_update_ga(cpu, r, ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; } static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, + svm->avic_is_running); } static void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } +/** + * This function is called during VCPU halt/unhalt. + */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->avic_is_running = is_run; + if (is_run) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); +} + static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = avic_init_backing_page(&svm->vcpu); if (err) goto free_page4; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); } /* We initialize this flag to true to make sure that the is_running @@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + unsigned long flags; + struct amd_svm_iommu_ir *cur; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_for_each_entry(cur, &svm->ir_list, node) { + if (cur->data != pi->ir_data) + continue; + list_del(&cur->node); + kfree(cur); + break; + } + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + + /** + * In some cases, the existing irte is updaed and re-set, + * so we need to check here if it's already been * added + * to the ir_list. + */ + if (pi->ir_data && (pi->prev_ga_tag != 0)) { + struct kvm *kvm = svm->vcpu.kvm; + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); + struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + struct vcpu_svm *prev_svm; + + if (!prev_vcpu) { + ret = -EINVAL; + goto out; + } + + prev_svm = to_svm(prev_vcpu); + svm_ir_list_del(prev_svm, pi); + } + + /** + * Allocating new amd_iommu_pi_data, which will get + * add to the per-vcpu ir_list. + */ + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + if (!ir) { + ret = -ENOMEM; + goto out; + } + ir->data = pi->ir_data; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_add(&ir->node, &svm->ir_list); + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +out: + return ret; +} + +/** + * Note: + * The HW cannot support posting multicast/broadcast + * interrupts to a vCPU. So, we still use legacy interrupt + * remapping for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + */ +static int +get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, + struct vcpu_data *vcpu_info, struct vcpu_svm **svm) +{ + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu = NULL; + + kvm_set_msi_irq(kvm, e, &irq); + + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", + __func__, irq.vector); + return -1; + } + + pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, + irq.vector); + *svm = to_svm(vcpu); + vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->vector = irq.vector; + + return 0; +} + +/* + * svm_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", + __func__, host_irq, guest_irq, set); + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + struct vcpu_data vcpu_info; + struct vcpu_svm *svm = NULL; + + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + + /** + * Here, we setup with legacy mode in the following cases: + * 1. When cannot target interrupt to a specific vcpu. + * 2. Unsetting posted interrupt. + * 3. APIC virtialization is disabled for the vcpu. + */ + if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && + kvm_vcpu_apicv_active(&svm->vcpu)) { + struct amd_iommu_pi_data pi; + + /* Try to enable guest_mode in IRTE */ + pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, + svm->vcpu.vcpu_id); + pi.is_guest_mode = true; + pi.vcpu_data = &vcpu_info; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Here, we successfully setting up vcpu affinity in + * IOMMU guest mode. Now, we need to store the posted + * interrupt information in a per-vcpu ir_list so that + * we can reference to them directly when we update vcpu + * scheduling information in IOMMU irte. + */ + if (!ret && pi.is_guest_mode) + svm_ir_list_add(svm, &pi); + } else { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } + + if (!ret && svm) { + trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, + host_irq, e->gsi, + vcpu_info.vector, + vcpu_info.pi_desc_addr, set); + } + + if (ret < 0) { + pr_err("%s: failed to update PI IRTE\n", __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, @@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .update_pi_irte = svm_update_pi_irte, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 121fdf6..cf1b16d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock); static struct vmcs_config { int size; int order; + u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; @@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); @@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + } } else { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode; @@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) -{ - return vmcs_read64(TSC_OFFSET); -} - /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + if (cpu_has_vmx_basic_inout()) + *pdata |= VMX_BASIC_INOUT; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: @@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->order = get_order(vmcs_conf->size); + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + if (apicv_active) { + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); + } else { + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + } } -static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + if (apicv_active) { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); + } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + } } -static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); + if (apicv_active) { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); + } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_W); + } } static bool vmx_get_enable_apicv(void) @@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (is_guest_mode(vcpu)) - return; + if (!is_guest_mode(vcpu)) { + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; + ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; } - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } @@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + if (gla_validity == 0x2) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), @@ -6360,22 +6395,32 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_legacy_x2apic) goto out2; + vmx_msr_bitmap_legacy_x2apic_apicv_inactive = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) + goto out3; + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) - goto out3; + goto out4; vmx_msr_bitmap_longmode_x2apic = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; + goto out5; + + vmx_msr_bitmap_longmode_x2apic_apicv_inactive = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) + goto out6; vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out6; + goto out7; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out7; + goto out8; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6394,7 +6439,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out8; + goto out9; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6461,20 +6506,35 @@ static __init int hardware_setup(void) vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + /* + * enable_apicv && kvm_vcpu_apicv_active() + */ for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); + vmx_disable_intercept_msr_read_x2apic(msr, true); /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); + vmx_enable_intercept_msr_read_x2apic(0x839, true); /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); + vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); + vmx_disable_intercept_msr_write_x2apic(0x80b, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); + vmx_disable_intercept_msr_write_x2apic(0x83f, true); + + /* + * (enable_apicv && !kvm_vcpu_apicv_active()) || + * !enable_apicv + */ + /* TPR */ + vmx_disable_intercept_msr_read_x2apic(0x808, false); + vmx_disable_intercept_msr_write_x2apic(0x808, false); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6521,14 +6581,18 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out8: +out9: free_page((unsigned long)vmx_vmwrite_bitmap); -out7: +out8: free_page((unsigned long)vmx_vmread_bitmap); +out7: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); out6: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: +out5: free_page((unsigned long)vmx_msr_bitmap_longmode); +out4: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); out3: free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: @@ -6544,7 +6608,9 @@ out: static __exit void hardware_unsetup(void) { free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); @@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) @@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_num = 0; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.vmxon = true; @@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } - /* - * There is not point to enable virtualize x2apic without enable - * apicv - */ - if (!cpu_has_vmx_virtualize_x2apic_mode() || - !kvm_vcpu_apicv_active(vcpu)) + if (!cpu_has_vmx_virtualize_x2apic_mode()) return; if (!cpu_need_tpr_shadow(vcpu)) @@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_warn_ratelimited( + pr_debug_ratelimited( "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", addr_field, maxphyaddr, count, addr); return -EINVAL; @@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) for (i = 0; i < count; i++) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); goto fail; } if (nested_vmx_load_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); goto fail; @@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.index = e.index; msr.data = e.value; if (kvm_set_msr(vcpu, &msr)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); goto fail; @@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); return -EINVAL; } if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); return -EINVAL; @@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr_info.host_initiated = false; msr_info.index = e.index; if (kvm_get_msr(vcpu, &msr_info)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; @@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), &msr_info.data, sizeof(msr_info.data))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, msr_info.data); return -EINVAL; @@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_ept(vmcs12)) + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * We are now running in L2, mmu_notifier will force to reload the * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. */ - kvm_vcpu_reload_apic_access_page(vcpu); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Exiting from L2 to L1, we're now back to L1 which thinks it just @@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 699f872..6c633de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + u64 curr_offset = vcpu->arch.tsc_offset; vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + kvm_x86_ops->write_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = offset; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = get_kernel_ns(); + ns = ktime_get_boot_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_x86_ops->write_tsc_offset(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); @@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } +static u64 __get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); + struct kvm_arch *ka = &kvm->arch; + s64 ns; + + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); + } else { + ns = ktime_get_boot_ns() + ka->kvmclock_offset; + } + + return ns; +} + +u64 get_kvmclock_ns(struct kvm *kvm) +{ + unsigned long flags; + s64 ns; + + local_irq_save(flags); + ns = __get_kvmclock_ns(kvm); + local_irq_restore(flags); + + return ns; +} + +static void kvm_setup_pvclock_page(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + struct pvclock_vcpu_time_info guest_hv_clock; + + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return; + + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. + */ + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; @@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; bool use_master_clock; @@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = get_kernel_ns(); + kernel_ns = ktime_get_boot_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->pv_time_enabled) - return 0; + /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); @@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = tgt_tsc_khz; } - /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return 0; - - /* This VCPU is paused, but it's legal for a guest to read another - * VCPU's kvmclock, so we really have to follow the specification where - * it says that version is odd if data is being modified, and even after - * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. - */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); - - smp_wmb(); - - /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - /* If the host uses TSC clocksource, then it is stable */ + pvclock_flags = 0; if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; vcpu->hv_clock.flags = pvclock_flags; - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - smp_wmb(); - - vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + if (vcpu->pv_time_enabled) + kvm_setup_pvclock_page(v); + if (v == kvm_get_vcpu(v->kvm, 0)) + kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); - kvm_x86_ops->write_tsc_offset(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu) && @@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_CLOCK: { struct kvm_clock_data user_ns; u64 now_ns; - s64 delta; r = -EFAULT; if (copy_from_user(&user_ns, argp, sizeof(user_ns))) @@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; local_irq_disable(); - now_ns = get_kernel_ns(); - delta = user_ns.clock - now_ns; + now_ns = __get_kvmclock_ns(kvm); + kvm->arch.kvmclock_offset += user_ns.clock - now_ns; local_irq_enable(); - kvm->arch.kvmclock_offset = delta; kvm_gen_update_masterclock(kvm); break; } @@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = get_kernel_ns(); - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; - local_irq_enable(); + now_ns = get_kvmclock_ns(kvm); + user_ns.clock = now_ns; user_ns.flags = 0; memset(&user_ns.pad, 0, sizeof(user_ns.pad)); @@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); - /* Interrupt is enabled by handle_external_intr() */ kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; @@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, get_kernel_ns() will be using boot + * elapsed; our helper function, ktime_get_boot_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a82ca46..e8ff3e4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) { return !(kvm->arch.disabled_quirks & quirk); @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); +u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, |