| author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-13 09:55:09 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-13 09:55:09 -0800 |
| commit | b9085bcbf5f43adf60533f9b635b2e7faeed0fe9 (patch) | |
| tree | e397abf5682a45c096e75b3d0fa99c8e228425fc /arch | |
| parent | c7d7b98671552abade78834c522b7308bda73c0d (diff) | |
| parent | 6557bada461afeaa920a189fae2cff7c8fdce39f (diff) | |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM update from Paolo Bonzini:
"Fairly small update, but there are some interesting new features.
Common:
Optional support for adding a small amount of polling on each HLT
instruction executed in the guest (or equivalent for other
architectures). This can improve latency up to 50% on some
scenarios (e.g. O_DSYNC writes or TCP_RR netperf tests). This
also has to be enabled manually for now, but the plan is to
auto-tune this in the future.
ARM/ARM64:
The highlights are support for GICv3 emulation and dirty page
tracking
s390:
Several optimizations and bugfixes. Also a first: a feature
exposed by KVM (UUID and long guest name in /proc/sysinfo) before
it is available in IBM's hypervisor! :)
MIPS:
Bugfixes.
x86:
Support for PML (page modification logging, a new feature in
Broadwell Xeons that speeds up dirty page tracking), nested
virtualization improvements (nested APICv---a nice optimization),
usual round of emulation fixes.
There is also a new option to reduce latency of the TSC deadline
timer in the guest; this needs to be tuned manually.
Some commits are common between this pull and Catalin's; I see you
have already included his tree.
Powerpc:
Nothing yet.
The KVM/PPC changes will come in through the PPC maintainers,
because I haven't received them yet and I might end up being
offline for some part of next week"
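The "Common:" item in the message above is the halt-polling work that appears in the shortlog as "kvm: add halt_poll_ns module parameter" and as the new halt_successful_poll stat threaded through every architecture in the diff. Below is a minimal userspace model of the idea, not KVM's actual kvm_vcpu_block() code: event_pending and wait_for_event() are hypothetical stand-ins for KVM's wakeup checks and its real blocking path, while halt_poll_ns and the two counters mirror names that do appear in this pull.

```c
#include <stdatomic.h>
#include <time.h>

static long halt_poll_ns;                      /* 0 = polling off (manual enable, per the message) */
static long halt_successful_poll, halt_wakeup; /* mirror the new vcpu stats */

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Poll briefly before blocking: a wakeup that lands inside the window
 * (e.g. the interrupt completing an O_DSYNC write or a TCP_RR ping-pong)
 * skips a full schedule-out/schedule-in round trip. */
static void vcpu_halt(atomic_bool *event_pending, void (*wait_for_event)(void))
{
	long long deadline = now_ns() + halt_poll_ns;

	while (halt_poll_ns && now_ns() < deadline) {
		if (atomic_load(event_pending)) {
			halt_successful_poll++;   /* poll paid off, no block needed */
			return;
		}
	}
	wait_for_event();                         /* nothing arrived in time: block for real */
	halt_wakeup++;
}
```

Consistent with "has to be enabled manually for now", a halt_poll_ns of 0 makes the loop fall straight through to the blocking path.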
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
KVM: ia64: drop kvm.h from installed user headers
KVM: x86: fix build with !CONFIG_SMP
KVM: x86: emulate: correct page fault error code for NoWrite instructions
KVM: Disable compat ioctl for s390
KVM: s390: add cpu model support
KVM: s390: use facilities and cpu_id per KVM
KVM: s390/CPACF: Choose crypto control block format
s390/kernel: Update /proc/sysinfo file with Extended Name and UUID
KVM: s390: reenable LPP facility
KVM: s390: floating irqs: fix user triggerable endless loop
kvm: add halt_poll_ns module parameter
kvm: remove KVM_MMIO_SIZE
KVM: MIPS: Don't leak FPU/DSP to guest
KVM: MIPS: Disable HTW while in guest
KVM: nVMX: Enable nested posted interrupt processing
KVM: nVMX: Enable nested virtual interrupt delivery
KVM: nVMX: Enable nested apic register virtualization
KVM: nVMX: Make nested control MSRs per-cpu
KVM: nVMX: Enable nested virtualize x2apic mode
KVM: nVMX: Prepare for using hardware MSR bitmap
...
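The dirty-page-tracking entries above (PML on x86 and the ARM/ARM64 dirty tracking series, both of which select KVM_GENERIC_DIRTYLOG_READ_PROTECT in the Kconfig hunks below) feed one and the same userspace interface. As a reference point for the new arm kvm_vm_ioctl_get_dirty_log() in the diff, here is a sketch of the consumer side; it assumes vm_fd is an existing VM file descriptor whose memslot was registered with the KVM_MEM_LOG_DIRTY_PAGES flag.

```c
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/*
 * Fetch-and-clear the dirty bitmap for one memslot. The kernel side
 * (kvm_get_dirty_log_protect() plus the arch TLB flush added in this
 * pull) snapshots the bits, write-protects the pages, and copies the
 * snapshot out, so each call reports pages dirtied since the last one.
 * bitmap is caller-allocated, one bit per page of the slot.
 */
static int fetch_dirty_log(int vm_fd, __u32 slot, void *bitmap, size_t npages)
{
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};

	memset(bitmap, 0, (npages + 7) / 8);
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}
```

A live-migration loop calls this repeatedly, retransmitting pages whose bit is set, until the residual dirty set is small enough for the final stop-and-copy pass.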
Diffstat (limited to 'arch')
70 files changed, 3276 insertions, 718 deletions
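One hunk in the diff below that deserves a worked example is reset_mpidr() in arch/arm64/kvm/sys_regs.c: instead of mapping vcpu_id straight into Aff0, it now spreads the id across three affinity levels, since the GICv3 ICC_SGIxR registers can address only 16 CPUs directly within affinity level 0. Here is a standalone model of that packing; the 0/8/16 shifts are what MPIDR_LEVEL_SHIFT(0..2) should expand to with 8-bit affinity fields, and bit 31 is set exactly as in the kernel hunk.

```c
#include <stdint.h>
#include <stdio.h>

/* Standalone model of the reset_mpidr() packing from the sys_regs.c
 * hunk: 4 bits of vcpu_id go to Aff0, the next 8 to Aff1, the next 8
 * to Aff2. */
static uint64_t mpidr_for_vcpu(uint32_t vcpu_id)
{
	uint64_t mpidr;

	mpidr  = (uint64_t)(vcpu_id & 0x0f);               /* Aff0, 4 bits */
	mpidr |= (uint64_t)((vcpu_id >> 4) & 0xff) << 8;   /* Aff1 */
	mpidr |= (uint64_t)((vcpu_id >> 12) & 0xff) << 16; /* Aff2 */
	return (1ULL << 31) | mpidr;                       /* bit 31 set, as in the hunk */
}

int main(void)
{
	/* vcpu_id 21 (0b1_0101) -> Aff0 = 5, Aff1 = 1: MPIDR 0x80000105 */
	printf("vcpu 21 -> MPIDR 0x%llx\n",
	       (unsigned long long)mpidr_for_vcpu(21));
	return 0;
}
```

This is also why kvm_vcpu_get_mpidr() becomes kvm_vcpu_get_mpidr_aff() throughout the diff: lookups such as the new kvm_mpidr_to_vcpu() compare only the affinity bits (MPIDR_HWID_BITMASK), never the full register value.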
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 3a67bec..25410b2 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -96,6 +96,7 @@ extern char __kvm_hyp_code_end[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); #endif diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 7b01523..a9c80a2 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -23,6 +23,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> #include <asm/kvm_arm.h> +#include <asm/cputype.h> unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu); @@ -177,9 +178,9 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; } -static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { - return vcpu->arch.cp15[c0_MPIDR]; + return vcpu->arch.cp15[c0_MPIDR] & MPIDR_HWID_BITMASK; } static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 04b4ea0..41008cd 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -68,6 +68,7 @@ struct kvm_arch { /* Interrupt controller */ struct vgic_dist vgic; + int max_vcpus; }; #define KVM_NR_MEM_OBJS 40 @@ -144,6 +145,7 @@ struct kvm_vm_stat { }; struct kvm_vcpu_stat { + u32 halt_successful_poll; u32 halt_wakeup; }; @@ -231,6 +233,10 @@ static inline void vgic_arch_setup(const struct vgic_params *vgic) int kvm_perf_init(void); int kvm_perf_teardown(void); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); + +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); + static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h index adcc0d7..3f83db2 100644 --- a/arch/arm/include/asm/kvm_mmio.h +++ b/arch/arm/include/asm/kvm_mmio.h @@ -37,6 +37,7 @@ struct kvm_exit_mmio { u8 data[8]; u32 len; bool is_write; + void *private; }; static inline void kvm_prepare_mmio(struct kvm_run *run, diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 1bca8f8..37ca2a4 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -115,6 +115,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= L_PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; +} + + /* Open coded p*d_addr_end that can deal with 64bit addresses */ #define kvm_pgd_addr_end(addr, end) \ ({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ diff --git a/arch/arm/include/asm/pgtable-3level.h 
b/arch/arm/include/asm/pgtable-3level.h index 423a5ac..a745a2a 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -129,6 +129,7 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define L_PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[1] */ #define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ /* diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 09ee408..0db25bc 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -175,6 +175,8 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 +#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 +#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3afee5f..338ace7 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -21,8 +21,10 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_ARM_HOST + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select SRCU depends on ARM_VIRT_EXT && ARM_LPAE ---help--- diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index f7057ed..443b8be 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -22,4 +22,5 @@ obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o +obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 0b0d58a..07e7eb1 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -132,6 +132,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Mark the initial VMID generation invalid */ kvm->arch.vmid_gen = 0; + /* The maximum number of VCPUs is limited by the host's GIC model */ + kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus(); + return ret; out_free_stage2_pgd: kvm_free_stage2_pgd(kvm); @@ -218,6 +221,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) goto out; } + if (id >= kvm->arch.max_vcpus) { + err = -EINVAL; + goto out; + } + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; @@ -241,9 +249,8 @@ out: return ERR_PTR(err); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) @@ -777,9 +784,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } } +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. 
Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -EINVAL; + bool is_dirty = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, @@ -811,7 +848,7 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_CREATE_IRQCHIP: { if (vgic_present) - return kvm_vgic_create(kvm); + return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); else return -ENXIO; } @@ -1035,6 +1072,19 @@ static void check_kvm_target_cpu(void *ret) *(int *)ret = kvm_target_cpu(); } +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) +{ + struct kvm_vcpu *vcpu; + int i; + + mpidr &= MPIDR_HWID_BITMASK; + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) + return vcpu; + } + return NULL; +} + /** * Initialize Hyp-mode and memory mappings on all CPUs. */ diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index a96a804..95f12b2 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -87,11 +87,13 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - trace_kvm_wfi(*vcpu_pc(vcpu)); - if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { + trace_kvm_wfx(*vcpu_pc(vcpu), true); kvm_vcpu_on_spin(vcpu); - else + } else { + trace_kvm_wfx(*vcpu_pc(vcpu), false); kvm_vcpu_block(vcpu); + } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index 01dcb0e..79caf79 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -66,6 +66,17 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) bx lr ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * + * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address + * parameter + */ + +ENTRY(__kvm_tlb_flush_vmid) + b __kvm_tlb_flush_vmid_ipa +ENDPROC(__kvm_tlb_flush_vmid) + /******************************************************************** * Flush TLBs and instruction caches of all CPUs inside the inner-shareable * domain, for all VMIDs diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1366625..3e6859b 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -45,6 +45,26 @@ static phys_addr_t hyp_idmap_vector; #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) +#define kvm_pud_huge(_x) pud_huge(_x) + +#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) +#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) + +static bool memslot_is_logging(struct kvm_memory_slot *memslot) +{ + return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); +} + +/** + * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8 + * @kvm: pointer to kvm structure. 
+ * + * Interface to HYP function to flush all VM TLB entries + */ +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + kvm_call_hyp(__kvm_tlb_flush_vmid, kvm); +} static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { @@ -78,6 +98,25 @@ static void kvm_flush_dcache_pud(pud_t pud) __kvm_flush_dcache_pud(pud); } +/** + * stage2_dissolve_pmd() - clear and flush huge PMD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pmd: pmd pointer for IPA + * + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all + * pages in the range dirty. + */ +static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) +{ + if (!kvm_pmd_huge(*pmd)) + return; + + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pmd)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -819,10 +858,15 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) + phys_addr_t addr, const pte_t *new_pte, + unsigned long flags) { pmd_t *pmd; pte_t *pte, old_pte; + bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; + bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; + + VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ pmd = stage2_get_pmd(kvm, cache, addr); @@ -834,6 +878,13 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, return 0; } + /* + * While dirty page logging - dissolve huge PMD, then continue on to + * allocate page. + */ + if (logging_active) + stage2_dissolve_pmd(kvm, addr, pmd); + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) @@ -890,7 +941,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, &cache, addr, &pte, true); + ret = stage2_set_pte(kvm, &cache, addr, &pte, + KVM_S2PTE_FLAG_IS_IOMAP); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -957,6 +1009,165 @@ static bool kvm_is_device_pfn(unsigned long pfn) return !pfn_valid(pfn); } +/** + * stage2_wp_ptes - write protect PMD range + * @pmd: pointer to pmd entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_none(*pte)) { + if (!kvm_s2pte_readonly(pte)) + kvm_set_s2pte_readonly(pte); + } + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +/** + * stage2_wp_pmds - write protect PUD range + * @pud: pointer to pud entry + * @addr: range start address + * @end: range end address + */ +static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +{ + pmd_t *pmd; + phys_addr_t next; + + pmd = pmd_offset(pud, addr); + + do { + next = kvm_pmd_addr_end(addr, end); + if (!pmd_none(*pmd)) { + if (kvm_pmd_huge(*pmd)) { + if (!kvm_s2pmd_readonly(pmd)) + kvm_set_s2pmd_readonly(pmd); + } else { + stage2_wp_ptes(pmd, addr, next); + } + } + } while (pmd++, addr = next, addr != end); +} + +/** + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + * + * Process PUD entries, for a huge PUD we cause a panic. 
+ */ +static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +{ + pud_t *pud; + phys_addr_t next; + + pud = pud_offset(pgd, addr); + do { + next = kvm_pud_addr_end(addr, end); + if (!pud_none(*pud)) { + /* TODO:PUD not supported, revisit later if supported */ + BUG_ON(kvm_pud_huge(*pud)); + stage2_wp_pmds(pud, addr, next); + } + } while (pud++, addr = next, addr != end); +} + +/** + * stage2_wp_range() - write protect stage2 memory region range + * @kvm: The KVM pointer + * @addr: Start address of range + * @end: End address of range + */ +static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + pgd_t *pgd; + phys_addr_t next; + + pgd = kvm->arch.pgd + pgd_index(addr); + do { + /* + * Release kvm_mmu_lock periodically if the memory region is + * large. Otherwise, we may see kernel panics with + * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, + * CONFIG_LOCKDEP. Additionally, holding the lock too long + * will also starve other vCPUs. + */ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + + next = kvm_pgd_addr_end(addr, end); + if (pgd_present(*pgd)) + stage2_wp_puds(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +/** + * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to write protect + * + * Called to start logging dirty pages after memory region + * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns + * all present PMD and PTEs are write protected in the memory region. + * Afterwards read of dirty page log can be called. + * + * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot); + phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; + phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_wp_range(kvm, start, end); + spin_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs(kvm); +} + +/** + * kvm_mmu_write_protect_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire kvm_mmu_lock. + */ +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + stage2_wp_range(kvm, start, end); +} + +/* + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * dirty pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. 
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn, unsigned long size, bool uncached) { @@ -977,6 +1188,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool fault_ipa_uncached; + bool logging_active = memslot_is_logging(memslot); + unsigned long flags = 0; write_fault = kvm_is_write_fault(vcpu); if (fault_status == FSC_PERM && !write_fault) { @@ -993,7 +1206,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (is_vm_hugetlb_page(vma)) { + if (is_vm_hugetlb_page(vma) && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { @@ -1034,12 +1247,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (is_error_pfn(pfn)) return -EFAULT; - if (kvm_is_device_pfn(pfn)) + if (kvm_is_device_pfn(pfn)) { mem_type = PAGE_S2_DEVICE; + flags |= KVM_S2PTE_FLAG_IS_IOMAP; + } else if (logging_active) { + /* + * Faults on pages in a memslot with logging enabled + * should not be mapped with huge pages (it introduces churn + * and performance degradation), so force a pte mapping. + */ + force_pte = true; + flags |= KVM_S2_FLAG_LOGGING_ACTIVE; + + /* + * Only actually map the page as writable if this was a write + * fault. + */ + if (!write_fault) + writable = false; + } spin_lock(&kvm->mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; + if (!hugetlb && !force_pte) hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); @@ -1056,16 +1287,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, mem_type); + if (writable) { kvm_set_s2pte_writable(&new_pte); kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); } coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); - ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, - pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE)); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } - out_unlock: spin_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -1215,7 +1446,14 @@ static void kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) { pte_t *pte = (pte_t *)data; - stage2_set_pte(kvm, NULL, gpa, pte, false); + /* + * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE + * flag clear because MMU notifiers will have unmapped a huge PMD before + * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore stage2_set_pte() never needs to clear out a huge PMD + * through this calling path. + */ + stage2_set_pte(kvm, NULL, gpa, pte, 0); } @@ -1348,6 +1586,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, enum kvm_mr_change change) { + /* + * At this point memslot has been committed and there is an + * allocated dirty_bitmap[], dirty pages will be be tracked while the + * memory slot is write protected. 
+ */ + if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_wp_memory_region(kvm, mem->slot); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -1360,7 +1605,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, bool writable = !(mem->flags & KVM_MEM_READONLY); int ret = 0; - if (change != KVM_MR_CREATE && change != KVM_MR_MOVE) + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && + change != KVM_MR_FLAGS_ONLY) return 0; /* @@ -1411,6 +1657,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, phys_addr_t pa = (vma->vm_pgoff << PAGE_SHIFT) + vm_start - vma->vm_start; + /* IO region dirty page logging not allowed */ + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) + return -EINVAL; + ret = kvm_phys_addr_ioremap(kvm, gpa, pa, vm_end - vm_start, writable); @@ -1420,6 +1670,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva = vm_end; } while (hva < reg_end); + if (change == KVM_MR_FLAGS_ONLY) + return ret; + spin_lock(&kvm->mmu_lock); if (ret) unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size); diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 58cb324..02fa8ef 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -22,6 +22,7 @@ #include <asm/cputype.h> #include <asm/kvm_emulate.h> #include <asm/kvm_psci.h> +#include <asm/kvm_host.h> /* * This is an implementation of the Power State Coordination Interface @@ -66,25 +67,17 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu = NULL, *tmp; + struct kvm_vcpu *vcpu = NULL; wait_queue_head_t *wq; unsigned long cpu_id; unsigned long context_id; - unsigned long mpidr; phys_addr_t target_pc; - int i; - cpu_id = *vcpu_reg(source_vcpu, 1); + cpu_id = *vcpu_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK; if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); - kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); - if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) { - vcpu = tmp; - break; - } - } + vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); /* * Make sure the caller requested a valid CPU and that the CPU is @@ -155,7 +148,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) * then ON else OFF */ kvm_for_each_vcpu(i, tmp, kvm) { - mpidr = kvm_vcpu_get_mpidr(tmp); + mpidr = kvm_vcpu_get_mpidr_aff(tmp); if (((mpidr & target_affinity_mask) == target_affinity) && !tmp->arch.pause) { return PSCI_0_2_AFFINITY_LEVEL_ON; diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index b6a6e71..881874b 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -140,19 +140,22 @@ TRACE_EVENT(kvm_emulate_cp15_imp, __entry->CRm, __entry->Op2) ); -TRACE_EVENT(kvm_wfi, - TP_PROTO(unsigned long vcpu_pc), - TP_ARGS(vcpu_pc), +TRACE_EVENT(kvm_wfx, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), TP_STRUCT__entry( __field( unsigned long, vcpu_pc ) + __field( bool, is_wfe ) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; ), - TP_printk("guest executed wfi at: 0x%08lx", __entry->vcpu_pc) + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 
'e' : 'i', __entry->vcpu_pc) ); TRACE_EVENT(kvm_unmap_hva, diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 6216709..92bbae3 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -96,6 +96,7 @@ #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) +#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1) #ifndef __ASSEMBLY__ #include <asm/types.h> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 4838421..4f7310f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -126,6 +126,7 @@ extern char __kvm_hyp_vector[]; extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); +extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0163b57..17e92f0 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -29,6 +29,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmio.h> #include <asm/ptrace.h> +#include <asm/cputype.h> unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num); unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu); @@ -140,6 +141,11 @@ static inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu) return ((phys_addr_t)vcpu->arch.fault.hpfar_el2 & HPFAR_MASK) << 8; } +static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_xVC_IMM_MASK; +} + static inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV); @@ -201,9 +207,9 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE; } -static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { - return vcpu_sys_reg(vcpu, MPIDR_EL1); + return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; } static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index acd101a..8ac3c70f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -59,6 +59,9 @@ struct kvm_arch { /* VTTBR value associated with above pgd and vmid */ u64 vttbr; + /* The maximum number of vCPUs depends on the used GIC model */ + int max_vcpus; + /* Interrupt controller */ struct vgic_dist vgic; @@ -159,6 +162,7 @@ struct kvm_vm_stat { }; struct kvm_vcpu_stat { + u32 halt_successful_poll; u32 halt_wakeup; }; @@ -196,6 +200,7 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); u64 kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); +void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); @@ -203,6 +208,8 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int kvm_perf_init(void); int kvm_perf_teardown(void); +struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); + static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h index fc2f689..9f52beb 100644 --- 
a/arch/arm64/include/asm/kvm_mmio.h +++ b/arch/arm64/include/asm/kvm_mmio.h @@ -40,6 +40,7 @@ struct kvm_exit_mmio { u8 data[8]; u32 len; bool is_write; + void *private; }; static inline void kvm_prepare_mmio(struct kvm_run *run, diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index adcf495..6458b53 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -118,6 +118,27 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd) pmd_val(*pmd) |= PMD_S2_RDWR; } +static inline void kvm_set_s2pte_readonly(pte_t *pte) +{ + pte_val(*pte) = (pte_val(*pte) & ~PTE_S2_RDWR) | PTE_S2_RDONLY; +} + +static inline bool kvm_s2pte_readonly(pte_t *pte) +{ + return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY; +} + +static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +{ + pmd_val(*pmd) = (pmd_val(*pmd) & ~PMD_S2_RDWR) | PMD_S2_RDONLY; +} + +static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +{ + return (pmd_val(*pmd) & PMD_S2_RDWR) == PMD_S2_RDONLY; +} + + #define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end) #define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end) #define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 88174e0..5f930cc 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -119,6 +119,7 @@ #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ /* diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 8e38878..3ef77a4 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -78,6 +78,13 @@ struct kvm_regs { #define KVM_VGIC_V2_DIST_SIZE 0x1000 #define KVM_VGIC_V2_CPU_SIZE 0x2000 +/* Supported VGICv3 address types */ +#define KVM_VGIC_V3_ADDR_TYPE_DIST 2 +#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 + +#define KVM_VGIC_V3_DIST_SIZE SZ_64K +#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) + #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ #define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */ @@ -161,6 +168,8 @@ struct kvm_arch_memory_slot { #define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 #define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) #define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 +#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 +#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 /* KVM_IRQ_LINE irq field index values */ #define KVM_ARM_IRQ_TYPE_SHIFT 24 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index a2ae194..f7fa65d 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -140,6 +140,7 @@ int main(void) DEFINE(VGIC_V2_CPU_ELRSR, offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr)); DEFINE(VGIC_V2_CPU_APR, offsetof(struct vgic_cpu, vgic_v2.vgic_apr)); DEFINE(VGIC_V2_CPU_LR, offsetof(struct vgic_cpu, vgic_v2.vgic_lr)); + DEFINE(VGIC_V3_CPU_SRE, offsetof(struct vgic_cpu, vgic_v3.vgic_sre)); DEFINE(VGIC_V3_CPU_HCR, offsetof(struct vgic_cpu, vgic_v3.vgic_hcr)); DEFINE(VGIC_V3_CPU_VMCR, offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr)); DEFINE(VGIC_V3_CPU_MISR, offsetof(struct vgic_cpu, vgic_v3.vgic_misr)); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index b334084..f5590c8 100644 --- 
a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -22,10 +22,12 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT + select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_ARM_HOST select KVM_ARM_VGIC select KVM_ARM_TIMER + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select SRCU ---help--- Support hosting virtualized guest machines. diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 32a0961..4e6e09e 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -21,7 +21,9 @@ kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2.o +kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v2-emul.o kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v2-switch.o kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3.o +kvm-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic-v3-emul.o kvm-$(CONFIG_KVM_ARM_VGIC) += vgic-v3-switch.o kvm-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 29b184a..524fa25 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -28,12 +28,18 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_psci.h> +#define CREATE_TRACE_POINTS +#include "trace.h" + typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; + trace_kvm_hvc_arm64(*vcpu_pc(vcpu), *vcpu_reg(vcpu, 0), + kvm_vcpu_hvc_get_imm(vcpu)); + ret = kvm_psci_call(vcpu); if (ret < 0) { kvm_inject_undefined(vcpu); @@ -63,10 +69,13 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) */ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) + if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) { + trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); kvm_vcpu_on_spin(vcpu); - else + } else { + trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); kvm_vcpu_block(vcpu); + } kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 9bff671..5befd01 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -1032,6 +1032,28 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) ret ENDPROC(__kvm_tlb_flush_vmid_ipa) +/** + * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs + * @struct kvm *kvm - pointer to kvm structure + * + * Invalidates all Stage 1 and 2 TLB entries for current VMID. + */ +ENTRY(__kvm_tlb_flush_vmid) + dsb ishst + + kern_hyp_va x0 + ldr x2, [x0, #KVM_VTTBR] + msr vttbr_el2, x2 + isb + + tlbi vmalls12e1is + dsb ish + isb + + msr vttbr_el2, xzr + ret +ENDPROC(__kvm_tlb_flush_vmid) + ENTRY(__kvm_flush_vm_context) dsb ishst tlbi alle1is diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index b96afdf..c370b40 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -113,6 +113,27 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu, return true; } +/* + * Trap handler for the GICv3 SGI generation system register. + * Forward the request to the VGIC emulation. + * The cp15_64 code makes sure this automatically works + * for both AArch64 and AArch32 accesses. 
+ */ +static bool access_gic_sgi(struct kvm_vcpu *vcpu, + const struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + u64 val; + + if (!p->is_write) + return read_from_write_only(vcpu, p); + + val = *vcpu_reg(vcpu, p->Rt); + vgic_v3_dispatch_sgi(vcpu, val); + + return true; +} + static bool trap_raz_wi(struct kvm_vcpu *vcpu, const struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -200,10 +221,19 @@ static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { + u64 mpidr; + /* - * Simply map the vcpu_id into the Aff0 field of the MPIDR. + * Map the vcpu_id into the first three affinity level fields of + * the MPIDR. We limit the number of VCPUs in level 0 due to a + * limitation to 16 CPUs in that level in the ICC_SGIxR registers + * of the GICv3 to be able to address each CPU directly when + * sending IPIs. */ - vcpu_sys_reg(vcpu, MPIDR_EL1) = (1UL << 31) | (vcpu->vcpu_id & 0xff); + mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); + mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); + mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); + vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr; } /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ @@ -373,6 +403,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), NULL, reset_val, VBAR_EL1, 0 }, + /* ICC_SGI1R_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b101), + access_gic_sgi }, /* ICC_SRE_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101), trap_raz_wi }, @@ -605,6 +638,8 @@ static const struct sys_reg_desc cp14_64_regs[] = { * register). */ static const struct sys_reg_desc cp15_regs[] = { + { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, + { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, @@ -652,6 +687,7 @@ static const struct sys_reg_desc cp15_regs[] = { static const struct sys_reg_desc cp15_64_regs[] = { { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, + { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, }; diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h new file mode 100644 index 0000000..157416e9 --- /dev/null +++ b/arch/arm64/kvm/trace.h @@ -0,0 +1,55 @@ +#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ARM64_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +TRACE_EVENT(kvm_wfx_arm64, + TP_PROTO(unsigned long vcpu_pc, bool is_wfe), + TP_ARGS(vcpu_pc, is_wfe), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_wfe) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_wfe = is_wfe; + ), + + TP_printk("guest executed wf%c at: 0x%08lx", + __entry->is_wfe ? 
'e' : 'i', __entry->vcpu_pc) +); + +TRACE_EVENT(kvm_hvc_arm64, + TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), + TP_ARGS(vcpu_pc, r0, imm), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(unsigned long, r0) + __field(unsigned long, imm) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->r0 = r0; + __entry->imm = imm; + ), + + TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)", + __entry->vcpu_pc, __entry->r0, __entry->imm) +); + +#endif /* _TRACE_ARM64_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/arm64/kvm/vgic-v3-switch.S b/arch/arm64/kvm/vgic-v3-switch.S index d160469..617a012 100644 --- a/arch/arm64/kvm/vgic-v3-switch.S +++ b/arch/arm64/kvm/vgic-v3-switch.S @@ -148,17 +148,18 @@ * x0: Register pointing to VCPU struct */ .macro restore_vgic_v3_state - // Disable SRE_EL1 access. Necessary, otherwise - // ICH_VMCR_EL2.VFIQEn becomes one, and FIQ happens... - msr_s ICC_SRE_EL1, xzr - isb - // Compute the address of struct vgic_cpu add x3, x0, #VCPU_VGIC_CPU // Restore all interesting registers ldr w4, [x3, #VGIC_V3_CPU_HCR] ldr w5, [x3, #VGIC_V3_CPU_VMCR] + ldr w25, [x3, #VGIC_V3_CPU_SRE] + + msr_s ICC_SRE_EL1, x25 + + // make sure SRE is valid before writing the other registers + isb msr_s ICH_HCR_EL2, x4 msr_s ICH_VMCR_EL2, x5 @@ -244,9 +245,12 @@ dsb sy // Prevent the guest from touching the GIC system registers + // if SRE isn't enabled for GICv3 emulation + cbnz x25, 1f mrs_s x5, ICC_SRE_EL2 and x5, x5, #~ICC_SRE_EL2_ENABLE msr_s ICC_SRE_EL2, x5 +1: .endm ENTRY(__save_vgic_v3_state) diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild index 1b3f5eb..891002b 100644 --- a/arch/ia64/include/uapi/asm/Kbuild +++ b/arch/ia64/include/uapi/asm/Kbuild @@ -18,7 +18,6 @@ header-y += intrinsics.h header-y += ioctl.h header-y += ioctls.h header-y += ipcbuf.h -header-y += kvm.h header-y += kvm_para.h header-y += mman.h header-y += msgbuf.h diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f2c2497..ac4fc71 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -120,6 +120,7 @@ struct kvm_vcpu_stat { u32 resvd_inst_exits; u32 break_inst_exits; u32 flush_dcache_exits; + u32 halt_successful_poll; u32 halt_wakeup; }; diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S index d7279c0..4a68b17 100644 --- a/arch/mips/kvm/locore.S +++ b/arch/mips/kvm/locore.S @@ -434,7 +434,7 @@ __kvm_mips_return_to_guest: /* Setup status register for running guest in UM */ .set at or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) - and v1, v1, ~ST0_CU0 + and v1, v1, ~(ST0_CU0 | ST0_MX) .set noat mtc0 v1, CP0_STATUS ehb diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e3b21e5..c9eccf5 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -15,9 +15,11 @@ #include <linux/vmalloc.h> #include <linux/fs.h> #include <linux/bootmem.h> +#include <asm/fpu.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> +#include <asm/pgtable.h> #include <linux/kvm_host.h> @@ -47,6 +49,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "resvd_inst", VCPU_STAT(resvd_inst_exits), KVM_STAT_VCPU }, { "break_inst", VCPU_STAT(break_inst_exits), KVM_STAT_VCPU }, { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, + { "halt_successful_poll", 
VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU }, {NULL} }; @@ -378,6 +381,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; } + lose_fpu(1); + local_irq_disable(); /* Check if we have any exceptions/interrupts pending */ kvm_mips_deliver_interrupts(vcpu, @@ -385,8 +390,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_guest_enter(); + /* Disable hardware page table walking while in guest */ + htw_stop(); + r = __kvm_mips_vcpu_run(run, vcpu); + /* Re-enable HTW before enabling interrupts */ + htw_start(); + kvm_guest_exit(); local_irq_enable(); @@ -832,9 +843,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -980,9 +990,6 @@ static void kvm_mips_set_c0_status(void) { uint32_t status = read_c0_status(); - if (cpu_has_fpu) - status |= (ST0_CU1); - if (cpu_has_dsp) status |= (ST0_MX); @@ -1002,6 +1009,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; + /* re-enable HTW before enabling interrupts */ + htw_start(); + /* Set a default exit reason */ run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; @@ -1136,6 +1146,9 @@ skip_emul: } } + /* Disable HTW before returning to guest or host */ + htw_stop(); + return ret; } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7efd666a..8ef0512 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -107,6 +107,7 @@ struct kvm_vcpu_stat { u32 emulated_inst_exits; u32 dec_exits; u32 ext_intr_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 dbell_exits; u32 gdbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 888bf46..cfbcdc6 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -52,6 +52,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, { "sp_storage", VCPU_STAT(sp_storage) }, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 9b55dec..6c1316a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -62,6 +62,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "inst_emu", VCPU_STAT(emulated_inst_exits) }, { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c45eaab..27c0fac 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -623,9 +623,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; } void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) diff --git 
a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9cba74d5..d84559e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -35,11 +35,13 @@ #define KVM_NR_IRQCHIPS 1 #define KVM_IRQCHIP_NUM_PINS 4096 -#define SIGP_CTRL_C 0x00800000 +#define SIGP_CTRL_C 0x80 +#define SIGP_CTRL_SCN_MASK 0x3f struct sca_entry { - atomic_t ctrl; - __u32 reserved; + __u8 reserved0; + __u8 sigp_ctrl; + __u16 reserved[3]; __u64 sda; __u64 reserved2[2]; } __attribute__((packed)); @@ -87,7 +89,8 @@ struct kvm_s390_sie_block { atomic_t cpuflags; /* 0x0000 */ __u32 : 1; /* 0x0004 */ __u32 prefix : 18; - __u32 : 13; + __u32 : 1; + __u32 ibc : 12; __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ @@ -132,7 +135,9 @@ struct kvm_s390_sie_block { __u8 reserved60; /* 0x0060 */ __u8 ecb; /* 0x0061 */ __u8 ecb2; /* 0x0062 */ - __u8 reserved63[1]; /* 0x0063 */ +#define ECB3_AES 0x04 +#define ECB3_DEA 0x08 + __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ __u32 todpr; /* 0x006c */ @@ -159,6 +164,7 @@ struct kvm_s390_sie_block { __u64 tecmc; /* 0x00e8 */ __u8 reservedf0[12]; /* 0x00f0 */ #define CRYCB_FORMAT1 0x00000001 +#define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ __u64 gbea; /* 0x0180 */ @@ -192,6 +198,7 @@ struct kvm_vcpu_stat { u32 exit_stop_request; u32 exit_validity; u32 exit_instruction; + u32 halt_successful_poll; u32 halt_wakeup; u32 instruction_lctl; u32 instruction_lctlg; @@ -378,14 +385,11 @@ struct kvm_s390_interrupt_info { struct kvm_s390_emerg_info emerg; struct kvm_s390_extcall_info extcall; struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; struct kvm_s390_mchk_info mchk; }; }; -/* for local_interrupt.action_flags */ -#define ACTION_STORE_ON_STOP (1<<0) -#define ACTION_STOP_ON_STOP (1<<1) - struct kvm_s390_irq_payload { struct kvm_s390_io_info io; struct kvm_s390_ext_info ext; @@ -393,6 +397,7 @@ struct kvm_s390_irq_payload { struct kvm_s390_emerg_info emerg; struct kvm_s390_extcall_info extcall; struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; struct kvm_s390_mchk_info mchk; }; @@ -401,7 +406,6 @@ struct kvm_s390_local_interrupt { struct kvm_s390_float_interrupt *float_int; wait_queue_head_t *wq; atomic_t *cpuflags; - unsigned int action_bits; DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); struct kvm_s390_irq_payload irq; unsigned long pending_irqs; @@ -470,7 +474,6 @@ struct kvm_vcpu_arch { }; struct gmap *gmap; struct kvm_guestdbg_info_arch guestdbg; -#define KVM_S390_PFAULT_TOKEN_INVALID (-1UL) unsigned long pfault_token; unsigned long pfault_select; unsigned long pfault_compare; @@ -504,13 +507,39 @@ struct s390_io_adapter { #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) #define MAX_S390_ADAPTER_MAPS 256 +/* maximum size of facilities and facility mask is 2k bytes */ +#define S390_ARCH_FAC_LIST_SIZE_BYTE (1<<11) +#define S390_ARCH_FAC_LIST_SIZE_U64 \ + (S390_ARCH_FAC_LIST_SIZE_BYTE / sizeof(u64)) +#define S390_ARCH_FAC_MASK_SIZE_BYTE S390_ARCH_FAC_LIST_SIZE_BYTE +#define S390_ARCH_FAC_MASK_SIZE_U64 \ + (S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64)) + +struct s390_model_fac { + /* facilities used in SIE context */ + __u64 sie[S390_ARCH_FAC_LIST_SIZE_U64]; + /* subset enabled by kvm */ + __u64 kvm[S390_ARCH_FAC_LIST_SIZE_U64]; +}; + +struct kvm_s390_cpu_model { + struct s390_model_fac *fac; + struct cpuid cpu_id; + unsigned short ibc; +}; + struct kvm_s390_crypto { struct kvm_s390_crypto_cb 
*crycb; __u32 crycbd; + __u8 aes_kw; + __u8 dea_kw; }; struct kvm_s390_crypto_cb { - __u8 reserved00[128]; /* 0x0000 */ + __u8 reserved00[72]; /* 0x0000 */ + __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ + __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ + __u8 reserved80[128]; /* 0x0080 */ }; struct kvm_arch{ @@ -523,12 +552,15 @@ struct kvm_arch{ int use_irqchip; int use_cmma; int user_cpu_state_ctrl; + int user_sigp; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; struct mutex ipte_mutex; spinlock_t start_stop_lock; + struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; + u64 epoch; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index edb453c..f1096ba 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -31,7 +31,8 @@ struct sclp_cpu_entry { u8 reserved0[2]; u8 : 3; u8 siif : 1; - u8 : 4; + u8 sigpif : 1; + u8 : 3; u8 reserved2[10]; u8 type; u8 reserved1; @@ -69,6 +70,7 @@ int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); unsigned long sclp_get_hsa_size(void); void sclp_early_detect(void); int sclp_has_siif(void); +int sclp_has_sigpif(void); unsigned int sclp_get_ibc(void); long _sclp_print_early(const char *); diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 73f12d2..f7054a8 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -15,6 +15,7 @@ #define __ASM_S390_SYSINFO_H #include <asm/bitsperlong.h> +#include <linux/uuid.h> struct sysinfo_1_1_1 { unsigned char p:1; @@ -116,10 +117,13 @@ struct sysinfo_3_2_2 { char name[8]; unsigned int caf; char cpi[16]; - char reserved_1[24]; - + char reserved_1[3]; + char ext_name_encoding; + unsigned int reserved_2; + uuid_be uuid; } vm[8]; - char reserved_544[3552]; + char reserved_3[1504]; + char ext_names[8][256]; }; extern int topology_max_mnest; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 48eda3a..9c77e60 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -57,10 +57,44 @@ struct kvm_s390_io_adapter_req { /* kvm attr_group on vm fd */ #define KVM_S390_VM_MEM_CTRL 0 +#define KVM_S390_VM_TOD 1 +#define KVM_S390_VM_CRYPTO 2 +#define KVM_S390_VM_CPU_MODEL 3 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 #define KVM_S390_VM_MEM_CLR_CMMA 1 +#define KVM_S390_VM_MEM_LIMIT_SIZE 2 + +/* kvm attributes for KVM_S390_VM_TOD */ +#define KVM_S390_VM_TOD_LOW 0 +#define KVM_S390_VM_TOD_HIGH 1 + +/* kvm attributes for KVM_S390_VM_CPU_MODEL */ +/* processor related attributes are r/w */ +#define KVM_S390_VM_CPU_PROCESSOR 0 +struct kvm_s390_vm_cpu_processor { + __u64 cpuid; + __u16 ibc; + __u8 pad[6]; + __u64 fac_list[256]; +}; + +/* machine related attributes are r/o */ +#define KVM_S390_VM_CPU_MACHINE 1 +struct kvm_s390_vm_cpu_machine { + __u64 cpuid; + __u32 ibc; + __u8 pad[4]; + __u64 fac_mask[256]; + __u64 fac_list[256]; +}; + +/* kvm attributes for crypto */ +#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 +#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 +#define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 +#define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -107,6 +141,9 @@ struct kvm_guest_debug_arch { struct kvm_hw_breakpoint __user *hw_bp; }; +/* for KVM_SYNC_PFAULT and KVM_REG_S390_PFTOKEN */ +#define KVM_S390_PFAULT_TOKEN_INVALID 0xffffffffffffffffULL + #define KVM_SYNC_PREFIX (1UL 
<< 0) #define KVM_SYNC_GPRS (1UL << 1) #define KVM_SYNC_ACRS (1UL << 2) diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 85565f1..99babea 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -204,6 +204,33 @@ static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info) } } +static void print_ext_name(struct seq_file *m, int lvl, + struct sysinfo_3_2_2 *info) +{ + if (info->vm[lvl].ext_name_encoding == 0) + return; + if (info->ext_names[lvl][0] == 0) + return; + switch (info->vm[lvl].ext_name_encoding) { + case 1: /* EBCDIC */ + EBCASC(info->ext_names[lvl], sizeof(info->ext_names[lvl])); + break; + case 2: /* UTF-8 */ + break; + default: + return; + } + seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl, + info->ext_names[lvl]); +} + +static void print_uuid(struct seq_file *m, int i, struct sysinfo_3_2_2 *info) +{ + if (!memcmp(&info->vm[i].uuid, &NULL_UUID_BE, sizeof(uuid_be))) + return; + seq_printf(m, "VM%02d UUID: %pUb\n", i, &info->vm[i].uuid); +} + static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info) { int i; @@ -221,6 +248,8 @@ static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info) seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured); seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby); seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved); + print_ext_name(m, i, info); + print_uuid(m, i, info); } } diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 8a1be90..267523c 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -357,8 +357,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, union asce asce; ctlreg0.val = vcpu->arch.sie_block->gcr[0]; - edat1 = ctlreg0.edat && test_vfacility(8); - edat2 = edat1 && test_vfacility(78); + edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8); + edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78); asce.val = get_vcpu_asce(vcpu); if (asce.r) goto real_address; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 81c77ab..bebd215 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -68,18 +68,27 @@ static int handle_noop(struct kvm_vcpu *vcpu) static int handle_stop(struct kvm_vcpu *vcpu) { + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc = 0; - unsigned int action_bits; + uint8_t flags, stop_pending; vcpu->stat.exit_stop_request++; - trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); - action_bits = vcpu->arch.local_int.action_bits; + /* delay the stop if any non-stop irq is pending */ + if (kvm_s390_vcpu_has_irq(vcpu, 1)) + return 0; + + /* avoid races with the injection/SIGP STOP code */ + spin_lock(&li->lock); + flags = li->irq.stop.flags; + stop_pending = kvm_s390_is_stop_irq_pending(vcpu); + spin_unlock(&li->lock); - if (!(action_bits & ACTION_STOP_ON_STOP)) + trace_kvm_s390_stop_request(stop_pending, flags); + if (!stop_pending) return 0; - if (action_bits & ACTION_STORE_ON_STOP) { + if (flags & KVM_S390_STOP_FLAG_STORE_STATUS) { rc = kvm_s390_vcpu_store_status(vcpu, KVM_S390_STORE_STATUS_NOADDR); if (rc) @@ -279,11 +288,13 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) irq.type = KVM_S390_INT_CPU_TIMER; break; case EXT_IRQ_EXTERNAL_CALL: - if (kvm_s390_si_ext_call_pending(vcpu)) - return 0; irq.type = KVM_S390_INT_EXTERNAL_CALL; irq.u.extcall.code = vcpu->arch.sie_block->extcpuaddr; - break; + rc = kvm_s390_inject_vcpu(vcpu, &irq); 
+ /* ignore if another external call is already pending */ + if (rc == -EBUSY) + return 0; + return rc; default: return -EOPNOTSUPP; } @@ -307,17 +318,19 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); /* Make sure that the source is paged-in */ - srcaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg2]); - if (kvm_is_error_gpa(vcpu->kvm, srcaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2], + &srcaddr, 0); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); if (rc != 0) return rc; /* Make sure that the destination is paged-in */ - dstaddr = kvm_s390_real_to_abs(vcpu, vcpu->run->s.regs.gprs[reg1]); - if (kvm_is_error_gpa(vcpu->kvm, dstaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1], + &dstaddr, 1); + if (rc) + return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); if (rc != 0) return rc; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f00f31e..073b5f3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -19,6 +19,7 @@ #include <linux/bitmap.h> #include <asm/asm-offsets.h> #include <asm/uaccess.h> +#include <asm/sclp.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" @@ -159,6 +160,12 @@ static unsigned long deliverable_local_irqs(struct kvm_vcpu *vcpu) if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* + * STOP irqs will never be actively delivered. They are triggered via + * intercept requests and cleared when the stop intercept is performed. + */ + __clear_bit(IRQ_PEND_SIGP_STOP, &active_mask); + return active_mask; } @@ -186,9 +193,6 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) LCTL_CR10 | LCTL_CR11); vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT); } - - if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) - atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags); } static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) @@ -216,11 +220,18 @@ static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->lctl |= LCTL_CR14; } +static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu) +{ + if (kvm_s390_is_stop_irq_pending(vcpu)) + __set_cpuflag(vcpu, CPUSTAT_STOP_INT); +} + /* Set interception request for non-deliverable local interrupts */ static void set_intercept_indicators_local(struct kvm_vcpu *vcpu) { set_intercept_indicators_ext(vcpu); set_intercept_indicators_mchk(vcpu); + set_intercept_indicators_stop(vcpu); } static void __set_intercept_indicator(struct kvm_vcpu *vcpu, @@ -392,18 +403,6 @@ static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) return rc ?
-EFAULT : 0; } -static int __must_check __deliver_stop(struct kvm_vcpu *vcpu) -{ - VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); - vcpu->stat.deliver_stop_signal++; - trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_SIGP_STOP, - 0, 0); - - __set_cpuflag(vcpu, CPUSTAT_STOP_INT); - clear_bit(IRQ_PEND_SIGP_STOP, &vcpu->arch.local_int.pending_irqs); - return 0; -} - static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -705,7 +704,6 @@ static const deliver_irq_t deliver_irq_funcs[] = { [IRQ_PEND_EXT_CLOCK_COMP] = __deliver_ckc, [IRQ_PEND_EXT_CPU_TIMER] = __deliver_cpu_timer, [IRQ_PEND_RESTART] = __deliver_restart, - [IRQ_PEND_SIGP_STOP] = __deliver_stop, [IRQ_PEND_SET_PREFIX] = __deliver_set_prefix, [IRQ_PEND_PFAULT_INIT] = __deliver_pfault_init, }; @@ -738,21 +736,20 @@ static int __must_check __deliver_floating_interrupt(struct kvm_vcpu *vcpu, return rc; } -/* Check whether SIGP interpretation facility has an external call pending */ -int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu) +/* Check whether an external call is pending (deliverable or not) */ +int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) { - atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl; + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + uint8_t sigp_ctrl = vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl; - if (!psw_extint_disabled(vcpu) && - (vcpu->arch.sie_block->gcr[0] & 0x2000ul) && - (atomic_read(sigp_ctrl) & SIGP_CTRL_C) && - (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND)) - return 1; + if (!sclp_has_sigpif()) + return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); - return 0; + return (sigp_ctrl & SIGP_CTRL_C) && + (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND); } -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop) { struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; struct kvm_s390_interrupt_info *inti; @@ -773,7 +770,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) if (!rc && kvm_cpu_has_pending_timer(vcpu)) rc = 1; - if (!rc && kvm_s390_si_ext_call_pending(vcpu)) + /* external call pending and deliverable */ + if (!rc && kvm_s390_ext_call_pending(vcpu) && + !psw_extint_disabled(vcpu) && + (vcpu->arch.sie_block->gcr[0] & 0x2000ul)) + rc = 1; + + if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu)) rc = 1; return rc; @@ -804,14 +807,20 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; /* disabled wait */ } - __set_cpu_idle(vcpu); if (!ckc_interrupts_enabled(vcpu)) { VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); + __set_cpu_idle(vcpu); goto no_timer; } now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); + + /* underflow */ + if (vcpu->arch.sie_block->ckc < now) + return 0; + + __set_cpu_idle(vcpu); hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); no_timer: @@ -820,7 +829,7 @@ no_timer: __unset_cpu_idle(vcpu); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); + hrtimer_cancel(&vcpu->arch.ckc_timer); return 0; } @@ -840,10 +849,20 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) { struct kvm_vcpu *vcpu; + u64 now, sltime; vcpu = 
container_of(timer, struct kvm_vcpu, arch.ckc_timer); - kvm_s390_vcpu_wakeup(vcpu); + now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; + sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); + /* + * If the monotonic clock runs faster than the tod clock we might be + * woken up too early and have to go back to sleep to avoid deadlocks. + */ + if (vcpu->arch.sie_block->ckc > now && + hrtimer_forward_now(timer, ns_to_ktime(sltime))) + return HRTIMER_RESTART; + kvm_s390_vcpu_wakeup(vcpu); return HRTIMER_NORESTART; } @@ -859,8 +878,7 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu) /* clear pending external calls set by sigp interpretation facility */ atomic_clear_mask(CPUSTAT_ECALL_PEND, li->cpuflags); - atomic_clear_mask(SIGP_CTRL_C, - &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl); + vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl = 0; } int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) @@ -984,18 +1002,43 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) return 0; } -int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) +static int __inject_extcall_sigpif(struct kvm_vcpu *vcpu, uint16_t src_id) +{ + unsigned char new_val, old_val; + uint8_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl; + + new_val = SIGP_CTRL_C | (src_id & SIGP_CTRL_SCN_MASK); + old_val = *sigp_ctrl & ~SIGP_CTRL_C; + if (cmpxchg(sigp_ctrl, old_val, new_val) != old_val) { + /* another external call is pending */ + return -EBUSY; + } + atomic_set_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags); + return 0; +} + +static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_extcall_info *extcall = &li->irq.extcall; + uint16_t src_id = irq->u.extcall.code; VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u", - irq->u.extcall.code); + src_id); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, - irq->u.extcall.code, 0, 2); + src_id, 0, 2); + + /* sending vcpu invalid */ + if (src_id >= KVM_MAX_VCPUS || + kvm_get_vcpu(vcpu->kvm, src_id) == NULL) + return -EINVAL; + if (sclp_has_sigpif()) + return __inject_extcall_sigpif(vcpu, src_id); + + if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) + return -EBUSY; *extcall = irq->u.extcall; - set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); return 0; } @@ -1006,23 +1049,41 @@ static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_prefix_info *prefix = &li->irq.prefix; VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)", - prefix->address); + irq->u.prefix.address); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, - prefix->address, 0, 2); + irq->u.prefix.address, 0, 2); + + if (!is_vcpu_stopped(vcpu)) + return -EBUSY; *prefix = irq->u.prefix; set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); return 0; } +#define KVM_S390_STOP_SUPP_FLAGS (KVM_S390_STOP_FLAG_STORE_STATUS) static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + struct kvm_s390_stop_info *stop = &li->irq.stop; + int rc = 0; trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0, 2); - li->action_bits |= ACTION_STOP_ON_STOP; - set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); + if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS) + return -EINVAL; + + if 
(is_vcpu_stopped(vcpu)) { + if (irq->u.stop.flags & KVM_S390_STOP_FLAG_STORE_STATUS) + rc = kvm_s390_store_status_unloaded(vcpu, + KVM_S390_STORE_STATUS_NOADDR); + return rc; + } + + if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs)) + return -EBUSY; + stop->flags = irq->u.stop.flags; + __set_cpuflag(vcpu, CPUSTAT_STOP_INT); return 0; } @@ -1042,14 +1103,13 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - struct kvm_s390_emerg_info *emerg = &li->irq.emerg; VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", irq->u.emerg.code); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, - emerg->code, 0, 2); + irq->u.emerg.code, 0, 2); - set_bit(emerg->code, li->sigp_emerg_pending); + set_bit(irq->u.emerg.code, li->sigp_emerg_pending); set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); return 0; @@ -1061,9 +1121,9 @@ static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_mchk_info *mchk = &li->irq.mchk; VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx", - mchk->mcic); + irq->u.mchk.mcic); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0, - mchk->mcic, 2); + irq->u.mchk.mcic, 2); /* * Because repressible machine checks can be indicated along with @@ -1121,7 +1181,6 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, if ((!schid && !cr6) || (schid && cr6)) return NULL; - mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; spin_lock(&fi->lock); inti = NULL; @@ -1149,7 +1208,6 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, if (list_empty(&fi->list)) atomic_set(&fi->active, 0); spin_unlock(&fi->lock); - mutex_unlock(&kvm->lock); return inti; } @@ -1162,7 +1220,6 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) int sigcpu; int rc = 0; - mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; spin_lock(&fi->lock); if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) { @@ -1187,6 +1244,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) list_add_tail(&inti->list, &iter->list); } atomic_set(&fi->active, 1); + if (atomic_read(&kvm->online_vcpus) == 0) + goto unlock_fi; sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); if (sigcpu == KVM_MAX_VCPUS) { do { @@ -1213,7 +1272,6 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu)); unlock_fi: spin_unlock(&fi->lock); - mutex_unlock(&kvm->lock); return rc; } @@ -1221,6 +1279,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt *s390int) { struct kvm_s390_interrupt_info *inti; + int rc; inti = kzalloc(sizeof(*inti), GFP_KERNEL); if (!inti) @@ -1239,7 +1298,6 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->ext.ext_params = s390int->parm; break; case KVM_S390_INT_PFAULT_DONE: - inti->type = s390int->type; inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_MCHK: @@ -1268,7 +1326,10 @@ int kvm_s390_inject_vm(struct kvm *kvm, trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64, 2); - return __inject_vm(kvm, inti); + rc = __inject_vm(kvm, inti); + if (rc) + kfree(inti); + return rc; } void kvm_s390_reinject_io_int(struct kvm *kvm, @@ -1290,13 +1351,16 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int, case KVM_S390_SIGP_SET_PREFIX: irq->u.prefix.address = s390int->parm; break; + case KVM_S390_SIGP_STOP: + irq->u.stop.flags = 
s390int->parm; + break; case KVM_S390_INT_EXTERNAL_CALL: - if (irq->u.extcall.code & 0xffff0000) + if (s390int->parm & 0xffff0000) return -EINVAL; irq->u.extcall.code = s390int->parm; break; case KVM_S390_INT_EMERGENCY: - if (irq->u.emerg.code & 0xffff0000) + if (s390int->parm & 0xffff0000) return -EINVAL; irq->u.emerg.code = s390int->parm; break; @@ -1307,6 +1371,23 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int, return 0; } +int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); +} + +void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + spin_lock(&li->lock); + li->irq.stop.flags = 0; + clear_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); + spin_unlock(&li->lock); +} + int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -1363,7 +1444,6 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) struct kvm_s390_float_interrupt *fi; struct kvm_s390_interrupt_info *n, *inti = NULL; - mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; spin_lock(&fi->lock); list_for_each_entry_safe(inti, n, &fi->list, list) { @@ -1373,7 +1453,6 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) fi->irq_count = 0; atomic_set(&fi->active, 0); spin_unlock(&fi->lock); - mutex_unlock(&kvm->lock); } static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti, @@ -1413,7 +1492,6 @@ static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len) int ret = 0; int n = 0; - mutex_lock(&kvm->lock); fi = &kvm->arch.float_int; spin_lock(&fi->lock); @@ -1432,7 +1510,6 @@ static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len) } spin_unlock(&fi->lock); - mutex_unlock(&kvm->lock); return ret < 0 ? 
ret : n; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3e09801..0c36239 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -22,6 +22,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/module.h> +#include <linux/random.h> #include <linux/slab.h> #include <linux/timer.h> #include <asm/asm-offsets.h> @@ -29,7 +30,6 @@ #include <asm/pgtable.h> #include <asm/nmi.h> #include <asm/switch_to.h> -#include <asm/facility.h> #include <asm/sclp.h> #include "kvm-s390.h" #include "gaccess.h" @@ -50,6 +50,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_instruction", VCPU_STAT(exit_instruction) }, { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, @@ -98,15 +99,20 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -unsigned long *vfacilities; -static struct gmap_notifier gmap_notifier; +/* upper facilities limit for kvm */ +unsigned long kvm_s390_fac_list_mask[] = { + 0xff82fffbf4fc2000UL, + 0x005c000000000000UL, +}; -/* test availability of vfacility */ -int test_vfacility(unsigned long nr) +unsigned long kvm_s390_fac_list_mask_size(void) { - return __test_facility(nr, (void *) vfacilities); + BUILD_BUG_ON(ARRAY_SIZE(kvm_s390_fac_list_mask) > S390_ARCH_FAC_MASK_SIZE_U64); + return ARRAY_SIZE(kvm_s390_fac_list_mask); } +static struct gmap_notifier gmap_notifier; + /* Section: not file related */ int kvm_arch_hardware_enable(void) { @@ -166,6 +172,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: + case KVM_CAP_S390_USER_SIGP: r = 1; break; case KVM_CAP_NR_VCPUS: @@ -254,6 +261,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.use_irqchip = 1; r = 0; break; + case KVM_CAP_S390_USER_SIGP: + kvm->arch.user_sigp = 1; + r = 0; + break; default: r = -EINVAL; break; @@ -261,7 +272,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) return r; } -static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) +static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + switch (attr->attr) { + case KVM_S390_VM_MEM_LIMIT_SIZE: + ret = 0; + if (put_user(kvm->arch.gmap->asce_end, (u64 __user *)attr->addr)) + ret = -EFAULT; + break; + default: + ret = -ENXIO; + break; + } + return ret; +} + +static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; unsigned int idx; @@ -283,6 +311,36 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) mutex_unlock(&kvm->lock); ret = 0; break; + case KVM_S390_VM_MEM_LIMIT_SIZE: { + unsigned long new_limit; + + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + + if (get_user(new_limit, (u64 __user *)attr->addr)) + return -EFAULT; + + if (new_limit > kvm->arch.gmap->asce_end) + return -E2BIG; + + ret = -EBUSY; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) == 0) { + /* gmap_alloc will round the limit up */ + struct gmap *new = gmap_alloc(current->mm, new_limit); + + if (!new) { + ret = -ENOMEM; + } else { + gmap_free(kvm->arch.gmap); + new->private = kvm; + kvm->arch.gmap = new; + ret = 0; 
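[The new KVM_S390_VM_MEM_LIMIT_SIZE attribute above moves a u64 through attr->addr with get_user()/put_user(): setting it fails with -EBUSY once vCPUs exist and with -E2BIG if the limit would grow. A hedged userspace sketch, assuming the usual KVM_SET_DEVICE_ATTR ioctl plumbing on the VM file descriptor; error handling is abbreviated.]

```c
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Shrink the guest address-space limit before any vCPU is created. */
static int set_mem_limit(int vm_fd, uint64_t new_limit)
{
        struct kvm_device_attr attr = {
                .group = KVM_S390_VM_MEM_CTRL,
                .attr  = KVM_S390_VM_MEM_LIMIT_SIZE,
                /* kernel side reads the value via get_user() from here */
                .addr  = (uint64_t)(uintptr_t)&new_limit,
        };

        /* -EBUSY once online_vcpus != 0; -E2BIG if the limit would grow */
        return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}
```

[Since gmap_alloc() may round the limit up, reading the attribute back with KVM_GET_DEVICE_ATTR can return a larger value than was written.]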
+ } + } + mutex_unlock(&kvm->lock); + break; + } default: ret = -ENXIO; break; @@ -290,13 +348,276 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu); + +static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_vcpu *vcpu; + int i; + + if (!test_kvm_facility(kvm, 76)) + return -EINVAL; + + mutex_lock(&kvm->lock); + switch (attr->attr) { + case KVM_S390_VM_CRYPTO_ENABLE_AES_KW: + get_random_bytes( + kvm->arch.crypto.crycb->aes_wrapping_key_mask, + sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + kvm->arch.crypto.aes_kw = 1; + break; + case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: + get_random_bytes( + kvm->arch.crypto.crycb->dea_wrapping_key_mask, + sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); + kvm->arch.crypto.dea_kw = 1; + break; + case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: + kvm->arch.crypto.aes_kw = 0; + memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0, + sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + break; + case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: + kvm->arch.crypto.dea_kw = 0; + memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0, + sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); + break; + default: + mutex_unlock(&kvm->lock); + return -ENXIO; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_crypto_setup(vcpu); + exit_sie(vcpu); + } + mutex_unlock(&kvm->lock); + return 0; +} + +static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) +{ + u8 gtod_high; + + if (copy_from_user(&gtod_high, (void __user *)attr->addr, + sizeof(gtod_high))) + return -EFAULT; + + if (gtod_high != 0) + return -EINVAL; + + return 0; +} + +static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_vcpu *cur_vcpu; + unsigned int vcpu_idx; + u64 host_tod, gtod; + int r; + + if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod))) + return -EFAULT; + + r = store_tod_clock(&host_tod); + if (r) + return r; + + mutex_lock(&kvm->lock); + kvm->arch.epoch = gtod - host_tod; + kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) { + cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; + exit_sie(cur_vcpu); + } + mutex_unlock(&kvm->lock); + return 0; +} + +static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + if (attr->flags) + return -EINVAL; + + switch (attr->attr) { + case KVM_S390_VM_TOD_HIGH: + ret = kvm_s390_set_tod_high(kvm, attr); + break; + case KVM_S390_VM_TOD_LOW: + ret = kvm_s390_set_tod_low(kvm, attr); + break; + default: + ret = -ENXIO; + break; + } + return ret; +} + +static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) +{ + u8 gtod_high = 0; + + if (copy_to_user((void __user *)attr->addr, &gtod_high, + sizeof(gtod_high))) + return -EFAULT; + + return 0; +} + +static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) +{ + u64 host_tod, gtod; + int r; + + r = store_tod_clock(&host_tod); + if (r) + return r; + + gtod = host_tod + kvm->arch.epoch; + if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod))) + return -EFAULT; + + return 0; +} + +static int kvm_s390_get_tod(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret; + + if (attr->flags) + return -EINVAL; + + switch (attr->attr) { + case KVM_S390_VM_TOD_HIGH: + ret = kvm_s390_get_tod_high(kvm, attr); + break; + case KVM_S390_VM_TOD_LOW: + ret = kvm_s390_get_tod_low(kvm, attr); + break; + default: + ret =
-ENXIO; + break; + } + return ret; +} + +static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_processor *proc; + int ret = 0; + + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus)) { + ret = -EBUSY; + goto out; + } + proc = kzalloc(sizeof(*proc), GFP_KERNEL); + if (!proc) { + ret = -ENOMEM; + goto out; + } + if (!copy_from_user(proc, (void __user *)attr->addr, + sizeof(*proc))) { + memcpy(&kvm->arch.model.cpu_id, &proc->cpuid, + sizeof(struct cpuid)); + kvm->arch.model.ibc = proc->ibc; + memcpy(kvm->arch.model.fac->kvm, proc->fac_list, + S390_ARCH_FAC_LIST_SIZE_BYTE); + } else + ret = -EFAULT; + kfree(proc); +out: + mutex_unlock(&kvm->lock); + return ret; +} + +static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->attr) { + case KVM_S390_VM_CPU_PROCESSOR: + ret = kvm_s390_set_processor(kvm, attr); + break; + } + return ret; +} + +static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_processor *proc; + int ret = 0; + + proc = kzalloc(sizeof(*proc), GFP_KERNEL); + if (!proc) { + ret = -ENOMEM; + goto out; + } + memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid)); + proc->ibc = kvm->arch.model.ibc; + memcpy(&proc->fac_list, kvm->arch.model.fac->kvm, S390_ARCH_FAC_LIST_SIZE_BYTE); + if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc))) + ret = -EFAULT; + kfree(proc); +out: + return ret; +} + +static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_machine *mach; + int ret = 0; + + mach = kzalloc(sizeof(*mach), GFP_KERNEL); + if (!mach) { + ret = -ENOMEM; + goto out; + } + get_cpu_id((struct cpuid *) &mach->cpuid); + mach->ibc = sclp_get_ibc(); + memcpy(&mach->fac_mask, kvm_s390_fac_list_mask, + kvm_s390_fac_list_mask_size() * sizeof(u64)); + memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, + S390_ARCH_FAC_LIST_SIZE_U64); + if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach))) + ret = -EFAULT; + kfree(mach); +out: + return ret; +} + +static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) +{ + int ret = -ENXIO; + + switch (attr->attr) { + case KVM_S390_VM_CPU_PROCESSOR: + ret = kvm_s390_get_processor(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE: + ret = kvm_s390_get_machine(kvm, attr); + break; + } + return ret; +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; switch (attr->group) { case KVM_S390_VM_MEM_CTRL: - ret = kvm_s390_mem_control(kvm, attr); + ret = kvm_s390_set_mem_control(kvm, attr); + break; + case KVM_S390_VM_TOD: + ret = kvm_s390_set_tod(kvm, attr); + break; + case KVM_S390_VM_CPU_MODEL: + ret = kvm_s390_set_cpu_model(kvm, attr); + break; + case KVM_S390_VM_CRYPTO: + ret = kvm_s390_vm_set_crypto(kvm, attr); break; default: ret = -ENXIO; @@ -308,7 +629,24 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) { - return -ENXIO; + int ret; + + switch (attr->group) { + case KVM_S390_VM_MEM_CTRL: + ret = kvm_s390_get_mem_control(kvm, attr); + break; + case KVM_S390_VM_TOD: + ret = kvm_s390_get_tod(kvm, attr); + break; + case KVM_S390_VM_CPU_MODEL: + ret = kvm_s390_get_cpu_model(kvm, attr); + break; + default: + ret = -ENXIO; + break; + } + + return ret; } static int kvm_s390_vm_has_attr(struct kvm *kvm, 
struct kvm_device_attr *attr) @@ -320,6 +658,42 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: case KVM_S390_VM_MEM_CLR_CMMA: + case KVM_S390_VM_MEM_LIMIT_SIZE: + ret = 0; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_TOD: + switch (attr->attr) { + case KVM_S390_VM_TOD_LOW: + case KVM_S390_VM_TOD_HIGH: + ret = 0; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_CPU_MODEL: + switch (attr->attr) { + case KVM_S390_VM_CPU_PROCESSOR: + case KVM_S390_VM_CPU_MACHINE: + ret = 0; + break; + default: + ret = -ENXIO; + break; + } + break; + case KVM_S390_VM_CRYPTO: + switch (attr->attr) { + case KVM_S390_VM_CRYPTO_ENABLE_AES_KW: + case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: + case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: + case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: ret = 0; break; default: @@ -401,9 +775,61 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } +static int kvm_s390_query_ap_config(u8 *config) +{ + u32 fcn_code = 0x04000000UL; + u32 cc; + + asm volatile( + "lgr 0,%1\n" + "lgr 2,%2\n" + ".long 0xb2af0000\n" /* PQAP(QCI) */ + "ipm %0\n" + "srl %0,28\n" + : "=r" (cc) + : "r" (fcn_code), "r" (config) + : "cc", "0", "2", "memory" + ); + + return cc; +} + +static int kvm_s390_apxa_installed(void) +{ + u8 config[128]; + int cc; + + if (test_facility(2) && test_facility(12)) { + cc = kvm_s390_query_ap_config(config); + + if (cc) + pr_err("PQAP(QCI) failed with cc=%d", cc); + else + return config[0] & 0x40; + } + + return 0; +} + +static void kvm_s390_set_crycb_format(struct kvm *kvm) +{ + kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb; + + if (kvm_s390_apxa_installed()) + kvm->arch.crypto.crycbd |= CRYCB_FORMAT2; + else + kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; +} + +static void kvm_s390_get_cpu_id(struct cpuid *cpu_id) +{ + get_cpu_id(cpu_id); + cpu_id->version = 0xff; +} + static int kvm_s390_crypto_init(struct kvm *kvm) { - if (!test_vfacility(76)) + if (!test_kvm_facility(kvm, 76)) return 0; kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb), @@ -411,15 +837,18 @@ static int kvm_s390_crypto_init(struct kvm *kvm) if (!kvm->arch.crypto.crycb) return -ENOMEM; - kvm->arch.crypto.crycbd = (__u32) (unsigned long) kvm->arch.crypto.crycb | - CRYCB_FORMAT1; + kvm_s390_set_crycb_format(kvm); + + /* Disable AES/DEA protected key functions by default */ + kvm->arch.crypto.aes_kw = 0; + kvm->arch.crypto.dea_kw = 0; return 0; } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - int rc; + int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -454,6 +883,46 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm->arch.dbf) goto out_nodbf; + /* + * The architectural maximum amount of facilities is 16 kbit. To store + * this amount, 2 kbyte of memory is required. Thus we need a full + * page to hold the active copy (arch.model.fac->sie) and the current + * facilities set (arch.model.fac->kvm). Its address size has to be + * 31 bits and word aligned. + */ + kvm->arch.model.fac = + (struct s390_model_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!kvm->arch.model.fac) + goto out_nofac; + + memcpy(kvm->arch.model.fac->kvm, S390_lowcore.stfle_fac_list, + S390_ARCH_FAC_LIST_SIZE_U64); + + /* + * If this KVM host runs *not* in a LPAR, relax the facility bits + * of the kvm facility mask by all missing facilities. 
This will allow + * to determine the right CPU model by means of the remaining facilities. + * Live guest migration must prohibit the migration of KVMs running in + * a LPAR to non LPAR hosts. + */ + if (!MACHINE_IS_LPAR) + for (i = 0; i < kvm_s390_fac_list_mask_size(); i++) + kvm_s390_fac_list_mask[i] &= kvm->arch.model.fac->kvm[i]; + + /* + * Apply the kvm facility mask to limit the kvm supported/tolerated + * facility list. + */ + for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) { + if (i < kvm_s390_fac_list_mask_size()) + kvm->arch.model.fac->kvm[i] &= kvm_s390_fac_list_mask[i]; + else + kvm->arch.model.fac->kvm[i] = 0UL; + } + + kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id); + kvm->arch.model.ibc = sclp_get_ibc() & 0x0fff; + if (kvm_s390_crypto_init(kvm) < 0) goto out_crypto; @@ -477,6 +946,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.css_support = 0; kvm->arch.use_irqchip = 0; + kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); @@ -484,6 +954,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) out_nogmap: kfree(kvm->arch.crypto.crycb); out_crypto: + free_page((unsigned long)kvm->arch.model.fac); +out_nofac: debug_unregister(kvm->arch.dbf); out_nodbf: free_page((unsigned long)(kvm->arch.sca)); @@ -536,6 +1008,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); + free_page((unsigned long)kvm->arch.model.fac); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); kfree(kvm->arch.crypto.crycb); @@ -546,25 +1019,30 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } /* Section: vcpu related */ +static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) +{ + vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); + if (!vcpu->arch.gmap) + return -ENOMEM; + vcpu->arch.gmap->private = vcpu->kvm; + + return 0; +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); - if (kvm_is_ucontrol(vcpu->kvm)) { - vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); - if (!vcpu->arch.gmap) - return -ENOMEM; - vcpu->arch.gmap->private = vcpu->kvm; - return 0; - } - - vcpu->arch.gmap = vcpu->kvm->arch.gmap; vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT; + + if (kvm_is_ucontrol(vcpu->kvm)) + return __kvm_ucontrol_vcpu_init(vcpu); + return 0; } @@ -615,16 +1093,27 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) kvm_s390_clear_local_irqs(vcpu); } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - return 0; + mutex_lock(&vcpu->kvm->lock); + vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch; + mutex_unlock(&vcpu->kvm->lock); + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.gmap = vcpu->kvm->arch.gmap; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) { - if (!test_vfacility(76)) + if (!test_kvm_facility(vcpu->kvm, 76)) return; + vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); + + if (vcpu->kvm->arch.crypto.aes_kw) + vcpu->arch.sie_block->ecb3 |= ECB3_AES; + if (vcpu->kvm->arch.crypto.dea_kw) + vcpu->arch.sie_block->ecb3 |= ECB3_DEA; + vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; } @@ -654,14 +1143,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) CPUSTAT_STOPPED | CPUSTAT_GED); vcpu->arch.sie_block->ecb = 6; - if (test_vfacility(50) && test_vfacility(73)) + if (test_kvm_facility(vcpu->kvm, 50) && 
test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= 0x10; vcpu->arch.sie_block->ecb2 = 8; - vcpu->arch.sie_block->eca = 0xD1002000U; + vcpu->arch.sie_block->eca = 0xC1002000U; if (sclp_has_siif()) vcpu->arch.sie_block->eca |= 1; - vcpu->arch.sie_block->fac = (int) (long) vfacilities; + if (sclp_has_sigpif()) + vcpu->arch.sie_block->eca |= 0x10000000U; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE | ICTL_TPROT; @@ -670,10 +1160,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (rc) return rc; } - hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; - get_cpu_id(&vcpu->arch.cpu_id); - vcpu->arch.cpu_id.version = 0xff; + + mutex_lock(&vcpu->kvm->lock); + vcpu->arch.cpu_id = vcpu->kvm->arch.model.cpu_id; + memcpy(vcpu->kvm->arch.model.fac->sie, vcpu->kvm->arch.model.fac->kvm, + S390_ARCH_FAC_LIST_SIZE_BYTE); + vcpu->arch.sie_block->ibc = vcpu->kvm->arch.model.ibc; + mutex_unlock(&vcpu->kvm->lock); kvm_s390_vcpu_crypto_setup(vcpu); @@ -717,6 +1212,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn); } + vcpu->arch.sie_block->fac = (int) (long) kvm->arch.model.fac->sie; spin_lock_init(&vcpu->arch.local_int.lock); vcpu->arch.local_int.float_int = &kvm->arch.float_int; @@ -741,7 +1237,7 @@ out: int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_cpu_has_interrupt(vcpu); + return kvm_s390_vcpu_has_irq(vcpu, 0); } void s390_vcpu_block(struct kvm_vcpu *vcpu) @@ -869,6 +1365,8 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_S390_PFTOKEN: r = get_user(vcpu->arch.pfault_token, (u64 __user *)reg->addr); + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + kvm_clear_async_pf_completion_queue(vcpu); break; case KVM_REG_S390_PFCOMPARE: r = get_user(vcpu->arch.pfault_compare, @@ -1176,7 +1674,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) return 0; if (psw_extint_disabled(vcpu)) return 0; - if (kvm_cpu_has_interrupt(vcpu)) + if (kvm_s390_vcpu_has_irq(vcpu, 0)) return 0; if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul)) return 0; @@ -1341,6 +1839,8 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.pfault_token = kvm_run->s.regs.pft; vcpu->arch.pfault_select = kvm_run->s.regs.pfs; vcpu->arch.pfault_compare = kvm_run->s.regs.pfc; + if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) + kvm_clear_async_pf_completion_queue(vcpu); } kvm_run->kvm_dirty_regs = 0; } @@ -1559,15 +2059,10 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); - /* Need to lock access to action_bits to avoid a SIGP race condition */ - spin_lock(&vcpu->arch.local_int.lock); - atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); - /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ - vcpu->arch.local_int.action_bits &= - ~(ACTION_STOP_ON_STOP | ACTION_STORE_ON_STOP); - spin_unlock(&vcpu->arch.local_int.lock); + kvm_s390_clear_stop_irq(vcpu); + atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); __disable_ibs_on_vcpu(vcpu); for (i = 0; i < online_vcpus; i++) { @@ -1783,30 +2278,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, static int __init kvm_s390_init(void) { - int ret; - 
ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - if (ret) - return ret; - - /* - * guests can ask for up to 255+1 double words, we need a full page - * to hold the maximum amount of facilities. On the other hand, we - * only set facilities that are known to work in KVM. - */ - vfacilities = (unsigned long *) get_zeroed_page(GFP_KERNEL|GFP_DMA); - if (!vfacilities) { - kvm_exit(); - return -ENOMEM; - } - memcpy(vfacilities, S390_lowcore.stfle_fac_list, 16); - vfacilities[0] &= 0xff82fffbf47c2000UL; - vfacilities[1] &= 0x005c000000000000UL; - return 0; + return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); } static void __exit kvm_s390_exit(void) { - free_page((unsigned long) vfacilities); kvm_exit(); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index a8f3d9b..985c211 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -18,12 +18,10 @@ #include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/facility.h> typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); -/* declare vfacilities extern */ -extern unsigned long *vfacilities; - /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10)) #define TDB_FORMAT1 1 @@ -127,6 +125,12 @@ static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc) vcpu->arch.sie_block->gpsw.mask |= cc << 44; } +/* test availability of facility in a kvm instance */ +static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr) +{ + return __test_facility(nr, kvm->arch.model.fac->kvm); +} + /* are cpu states controlled by user space */ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) { @@ -183,7 +187,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); /* is cmma enabled */ bool kvm_s390_cmma_enabled(struct kvm *kvm); -int test_vfacility(unsigned long nr); +unsigned long kvm_s390_fac_list_mask_size(void); +extern unsigned long kvm_s390_fac_list_mask[]; /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); @@ -228,11 +233,13 @@ int s390int_to_s390irq(struct kvm_s390_interrupt *s390int, struct kvm_s390_irq *s390irq); /* implemented in interrupt.c */ -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop); int psw_extint_disabled(struct kvm_vcpu *vcpu); void kvm_s390_destroy_adapters(struct kvm *kvm); -int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu); +int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu); extern struct kvm_device_ops kvm_flic_ops; +int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu); +void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu); /* implemented in guestdbg.c */ void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 1be578d..bdd9b5b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -337,19 +337,24 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) static int handle_stfl(struct kvm_vcpu *vcpu) { int rc; + unsigned int fac; vcpu->stat.instruction_stfl++; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + /* + * We need to shift the lower 32 facility bits (bit 0-31) from a u64 + * into a u32 memory representation. They will remain bits 0-31.
+ */ + fac = *vcpu->kvm->arch.model.fac->sie >> 32; rc = write_guest_lc(vcpu, offsetof(struct _lowcore, stfl_fac_list), - vfacilities, 4); + &fac, sizeof(fac)); if (rc) return rc; - VCPU_EVENT(vcpu, 5, "store facility list value %x", - *(unsigned int *) vfacilities); - trace_kvm_s390_handle_stfl(vcpu, *(unsigned int *) vfacilities); + VCPU_EVENT(vcpu, 5, "store facility list value %x", fac); + trace_kvm_s390_handle_stfl(vcpu, fac); return 0; } diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 6651f9f..23b1e86 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -26,15 +26,17 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, struct kvm_s390_local_interrupt *li; int cpuflags; int rc; + int ext_call_pending; li = &dst_vcpu->arch.local_int; cpuflags = atomic_read(li->cpuflags); - if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED))) + ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu); + if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending) rc = SIGP_CC_ORDER_CODE_ACCEPTED; else { *reg &= 0xffffffff00000000UL; - if (cpuflags & CPUSTAT_ECALL_PEND) + if (ext_call_pending) *reg |= SIGP_STATUS_EXT_CALL_PENDING; if (cpuflags & CPUSTAT_STOPPED) *reg |= SIGP_STATUS_STOPPED; @@ -96,7 +98,7 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, } static int __sigp_external_call(struct kvm_vcpu *vcpu, - struct kvm_vcpu *dst_vcpu) + struct kvm_vcpu *dst_vcpu, u64 *reg) { struct kvm_s390_irq irq = { .type = KVM_S390_INT_EXTERNAL_CALL, @@ -105,45 +107,31 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, int rc; rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); - if (!rc) + if (rc == -EBUSY) { + *reg &= 0xffffffff00000000UL; + *reg |= SIGP_STATUS_EXT_CALL_PENDING; + return SIGP_CC_STATUS_STORED; + } else if (rc == 0) { VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", dst_vcpu->vcpu_id); - - return rc ? rc : SIGP_CC_ORDER_CODE_ACCEPTED; -} - -static int __inject_sigp_stop(struct kvm_vcpu *dst_vcpu, int action) -{ - struct kvm_s390_local_interrupt *li = &dst_vcpu->arch.local_int; - int rc = SIGP_CC_ORDER_CODE_ACCEPTED; - - spin_lock(&li->lock); - if (li->action_bits & ACTION_STOP_ON_STOP) { - /* another SIGP STOP is pending */ - rc = SIGP_CC_BUSY; - goto out; } - if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { - if ((action & ACTION_STORE_ON_STOP) != 0) - rc = -ESHUTDOWN; - goto out; - } - set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); - li->action_bits |= action; - atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); - kvm_s390_vcpu_wakeup(dst_vcpu); -out: - spin_unlock(&li->lock); - return rc; + return rc ? 
rc : SIGP_CC_ORDER_CODE_ACCEPTED; } static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu) { + struct kvm_s390_irq irq = { + .type = KVM_S390_SIGP_STOP, + }; int rc; - rc = __inject_sigp_stop(dst_vcpu, ACTION_STOP_ON_STOP); - VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", dst_vcpu->vcpu_id); + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) + rc = SIGP_CC_BUSY; + else if (rc == 0) + VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", + dst_vcpu->vcpu_id); return rc; } @@ -151,20 +139,18 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu) static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, u64 *reg) { + struct kvm_s390_irq irq = { + .type = KVM_S390_SIGP_STOP, + .u.stop.flags = KVM_S390_STOP_FLAG_STORE_STATUS, + }; int rc; - rc = __inject_sigp_stop(dst_vcpu, ACTION_STOP_ON_STOP | - ACTION_STORE_ON_STOP); - VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x", - dst_vcpu->vcpu_id); - - if (rc == -ESHUTDOWN) { - /* If the CPU has already been stopped, we still have - * to save the status when doing stop-and-store. This - * has to be done after unlocking all spinlocks. */ - rc = kvm_s390_store_status_unloaded(dst_vcpu, - KVM_S390_STORE_STATUS_NOADDR); - } + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) + rc = SIGP_CC_BUSY; + else if (rc == 0) + VCPU_EVENT(vcpu, 4, "sent sigp stop and store status to cpu %x", + dst_vcpu->vcpu_id); return rc; } @@ -197,41 +183,33 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, u32 address, u64 *reg) { - struct kvm_s390_local_interrupt *li; + struct kvm_s390_irq irq = { + .type = KVM_S390_SIGP_SET_PREFIX, + .u.prefix.address = address & 0x7fffe000u, + }; int rc; - li = &dst_vcpu->arch.local_int; - /* * Make sure the new value is valid memory. We only need to check the * first page, since address is 8k aligned and memory pieces are always * at least 1MB aligned and have at least a size of 1MB. 
*/ - address &= 0x7fffe000u; - if (kvm_is_error_gpa(vcpu->kvm, address)) { + if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; } - spin_lock(&li->lock); - /* cpu must be in stopped state */ - if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) { + rc = kvm_s390_inject_vcpu(dst_vcpu, &irq); + if (rc == -EBUSY) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INCORRECT_STATE; - rc = SIGP_CC_STATUS_STORED; - goto out_li; + return SIGP_CC_STATUS_STORED; + } else if (rc == 0) { + VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", + dst_vcpu->vcpu_id, irq.u.prefix.address); } - li->irq.prefix.address = address; - set_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); - kvm_s390_vcpu_wakeup(dst_vcpu); - rc = SIGP_CC_ORDER_CODE_ACCEPTED; - - VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", dst_vcpu->vcpu_id, - address); -out_li: - spin_unlock(&li->lock); return rc; } @@ -242,9 +220,7 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, int flags; int rc; - spin_lock(&dst_vcpu->arch.local_int.lock); flags = atomic_read(dst_vcpu->arch.local_int.cpuflags); - spin_unlock(&dst_vcpu->arch.local_int.lock); if (!(flags & CPUSTAT_STOPPED)) { *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INCORRECT_STATE; @@ -291,8 +267,9 @@ static int __prepare_sigp_re_start(struct kvm_vcpu *vcpu, /* handle (RE)START in user space */ int rc = -EOPNOTSUPP; + /* make sure we don't race with STOP irq injection */ spin_lock(&li->lock); - if (li->action_bits & ACTION_STOP_ON_STOP) + if (kvm_s390_is_stop_irq_pending(dst_vcpu)) rc = SIGP_CC_BUSY; spin_unlock(&li->lock); @@ -333,7 +310,7 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, break; case SIGP_EXTERNAL_CALL: vcpu->stat.instruction_sigp_external_call++; - rc = __sigp_external_call(vcpu, dst_vcpu); + rc = __sigp_external_call(vcpu, dst_vcpu, status_reg); break; case SIGP_EMERGENCY_SIGNAL: vcpu->stat.instruction_sigp_emergency++; @@ -394,6 +371,53 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, return rc; } +static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code) +{ + if (!vcpu->kvm->arch.user_sigp) + return 0; + + switch (order_code) { + case SIGP_SENSE: + case SIGP_EXTERNAL_CALL: + case SIGP_EMERGENCY_SIGNAL: + case SIGP_COND_EMERGENCY_SIGNAL: + case SIGP_SENSE_RUNNING: + return 0; + /* update counters as we're directly dropping to user space */ + case SIGP_STOP: + vcpu->stat.instruction_sigp_stop++; + break; + case SIGP_STOP_AND_STORE_STATUS: + vcpu->stat.instruction_sigp_stop_store_status++; + break; + case SIGP_STORE_STATUS_AT_ADDRESS: + vcpu->stat.instruction_sigp_store_status++; + break; + case SIGP_SET_PREFIX: + vcpu->stat.instruction_sigp_prefix++; + break; + case SIGP_START: + vcpu->stat.instruction_sigp_start++; + break; + case SIGP_RESTART: + vcpu->stat.instruction_sigp_restart++; + break; + case SIGP_INITIAL_CPU_RESET: + vcpu->stat.instruction_sigp_init_cpu_reset++; + break; + case SIGP_CPU_RESET: + vcpu->stat.instruction_sigp_cpu_reset++; + break; + default: + vcpu->stat.instruction_sigp_unknown++; + } + + VCPU_EVENT(vcpu, 4, "sigp order %u: completely handled in user space", + order_code); + + return 1; +} + int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) { int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; @@ -408,6 +432,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); order_code = 
kvm_s390_get_base_disp_rs(vcpu); + if (handle_sigp_order_in_user_space(vcpu, order_code)) + return -EOPNOTSUPP; if (r1 % 2) parameter = vcpu->run->s.regs.gprs[r1]; diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 647e9d6..653a7ec 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -209,19 +209,21 @@ TRACE_EVENT(kvm_s390_request_resets, * Trace point for a vcpu's stop requests. */ TRACE_EVENT(kvm_s390_stop_request, - TP_PROTO(unsigned int action_bits), - TP_ARGS(action_bits), + TP_PROTO(unsigned char stop_irq, unsigned char flags), + TP_ARGS(stop_irq, flags), TP_STRUCT__entry( - __field(unsigned int, action_bits) + __field(unsigned char, stop_irq) + __field(unsigned char, flags) ), TP_fast_assign( - __entry->action_bits = action_bits; + __entry->stop_irq = stop_irq; + __entry->flags = flags; ), - TP_printk("stop request, action_bits = %08x", - __entry->action_bits) + TP_printk("stop request, stop irq = %u, flags = %08x", + __entry->stop_irq, __entry->flags) ); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index eb18117..57a9d94 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,7 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d89c6b8..a236e39 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,8 +38,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_MMIO_SIZE 16 - #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -51,7 +49,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL -#define CR3_PCID_INVD (1UL << 63) +#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -160,6 +158,18 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -615,6 +625,8 @@ struct kvm_arch { #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif + + bool boot_vcpu_runs_old_kvmclock; }; struct kvm_vm_stat { @@ -643,6 +655,7 @@ struct kvm_vcpu_stat { u32 irq_window_exits; u32 nmi_window_exits; u32 halt_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -787,6 +800,31 @@ struct kvm_x86_ops { int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* + * Arch-specific dirty logging hooks. These hooks are only supposed to + * be valid if the specific arch has hardware-accelerated dirty logging + * mechanism. Currently only for PML on VMX. + * + * - slot_enable_log_dirty: + * called when enabling log dirty mode for the slot. 
+ * - slot_disable_log_dirty: + * called when disabling log dirty mode for the slot. + * also called when slot is created with log dirty disabled. + * - flush_log_dirty: + * called before reporting dirty_bitmap to userspace. + * - enable_log_dirty_pt_masked: + * called when reenabling log dirty for the GFNs in the mask after + * corresponding bits are cleared in slot->dirty_bitmap. + */ + void (*slot_enable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*slot_disable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*flush_log_dirty)(struct kvm *kvm); + void (*enable_log_dirty_pt_masked)(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t offset, unsigned long mask); }; struct kvm_arch_async_pf { @@ -819,10 +857,17 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 45afaee..da772ed 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -121,6 +122,7 @@ enum vmcs_field { GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -140,6 +142,8 @@ enum vmcs_field { VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, TSC_OFFSET = 0x00002010, TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 536240fa..3ce0791 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -364,6 +364,9 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b +#define MSR_IA32_SMBASE 0x0000009e + #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b813bf9..c5f1a1d 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -56,6 +56,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define 
EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 @@ -72,6 +73,7 @@ #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -116,10 +118,14 @@ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } +#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 + #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7dc7ba5..413a7bf 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -39,6 +39,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU ---help--- diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de12c1d..e0b794a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -86,6 +86,7 @@ #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) +#define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) @@ -124,6 +125,7 @@ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ +#define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -165,10 +167,10 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ -#define NoBigReal ((u64)1 << 50) /* No big real mode */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ +#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -213,6 +215,7 @@ struct opcode { const struct gprefix *gprefix; const struct escape *esc; const struct instr_dual *idual; + const struct mode_dual *mdual; void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); @@ -240,6 +243,11 @@ struct instr_dual { struct opcode mod3; }; +struct mode_dual { + struct opcode mode32; + struct opcode mode64; +}; + /* EFLAGS bit definitions. 
*/ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -262,6 +270,13 @@ struct instr_dual { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +enum x86_transfer_type { + X86_TRANSFER_NONE, + X86_TRANSFER_CALL_JMP, + X86_TRANSFER_RET, + X86_TRANSFER_TASK_SWITCH, +}; + static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (!(ctxt->regs_valid & (1 << nr))) { @@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } if (addr.ea > lim) goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); - if (size > *max_size) - goto bad; + if (lim == 0xffffffff) + *max_size = ~0u; + else { + *max_size = (u64)lim + 1 - addr.ea; + if (size > *max_size) + goto bad; + } la &= (u32)-1; break; } @@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, const struct desc_struct *cs_desc) { enum x86emul_mode mode = ctxt->mode; + int rc; #ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { - u64 efer = 0; + if (ctxt->mode >= X86EMUL_MODE_PROT16) { + if (cs_desc->l) { + u64 efer = 0; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + mode = X86EMUL_MODE_PROT64; + } else + mode = X86EMUL_MODE_PROT32; /* temporary value */ } #endif if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - return assign_eip(ctxt, dst, mode); + rc = assign_eip(ctxt, dst, mode); + if (rc == X86EMUL_CONTINUE) + ctxt->mode = mode; + return rc; } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstcw %0": "+m"(fcw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fcw; return X86EMUL_CONTINUE; @@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstsw %0": "+m"(fsw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fsw; return X86EMUL_CONTINUE; @@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); + /* Increment ESP on POP [ESP] */ + if ((ctxt->d & IncSP) && + base_reg == VCPU_REGS_RSP) + modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; @@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, ops->get_gdt(ctxt, dt); } -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc, - ulong *desc_addr_p) +static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, + u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - *desc_addr_p = addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + addr = dt.address + index * 8; + +#ifdef CONFIG_X86_64 + if (addr >> 32 != 0) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (!(efer & EFER_LMA)) + addr &= (u32)-1; + } +#endif + + *desc_addr_p = addr; + return X86EMUL_CONTINUE; +} + +/* allowed just for 8 bytes segments */ 
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) +{ + int rc; + + rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); + if (rc != X86EMUL_CONTINUE) + return rc; + + return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), &ctxt->exception); } @@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { - struct desc_ptr dt; - u16 index = selector >> 3; + int rc; ulong addr; - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); + rc = get_descriptor_ptr(ctxt, selector, &addr); + if (rc != X86EMUL_CONTINUE) + return rc; - addr = dt.address + index * 8; return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, - bool in_task_switch, + enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; @@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR; + err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : + GP_VECTOR; /* can't load system descriptor into segment selector */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) + if (seg <= VCPU_SREG_GS && !seg_desc.s) { + if (transfer == X86_TRANSFER_CALL_JMP) + return X86EMUL_UNHANDLEABLE; goto exception; + } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; @@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (seg_desc.s) { /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (!(seg_desc.type & 1)) { + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, selector, + &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, sizeof(base3), &ctxt->exception); @@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) @@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) unsigned long selector; int rc; - rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); + rc = emulate_pop(ctxt, &selector, 2); if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + if (ctxt->op_bytes > 2) + rsp_increment(ctxt, ctxt->op_bytes - 2); rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; @@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + ctxt->ops->set_nmi_mask(ctxt, false); return rc; } @@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, + X86_TRANSFER_RET, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ + /* Success: write back to memory; no update of EAX */ + ctxt->src.type = OP_NONE; ctxt->dst.val = ctxt->src.orig_val; } else { /* Failure: write the value we saw to EAX. 
*/ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.type = OP_REG; + ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; + /* Create write-cycle to dest by writing the same value */ ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; @@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, - cpl, true, NULL); + cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss32(ctxt, &tss_seg); @@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } @@ -2999,15 +3058,16 @@ static int 
em_call_far(struct x86_emulate_ctxt *ctxt) struct desc_struct old_desc, new_desc; const struct x86_emulate_ops *ops = ctxt->ops; int cpl = ctxt->ops->cpl(ctxt); + enum x86emul_mode prev_mode = ctxt->mode; old_eip = ctxt->_eip; ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, - &new_desc); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + return rc; rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) @@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) rc = em_push(ctxt); /* If we failed, we tainted the memory, but the very least we should restore cs */ - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; + } return rc; fail: ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + ctxt->mode = prev_mode; return rc; } @@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movsxd(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = (s32) ctxt->src.val; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } +#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } @@ -3738,7 +3808,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = { }; static const struct escape escape_d9 = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstcw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3896,7 +3966,7 @@ static const struct escape escape_db = { { } }; static const struct escape escape_dd = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstsw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = { I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N }; +static const struct mode_dual mode_dual_63 = { + N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = { /* 0x60 - 0x67 */ I(ImplicitOps | Stack | No64, em_pusha), I(ImplicitOps | Stack | No64, em_popa), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, MD(ModRM, &mode_dual_63), N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), @@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | 
Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), - I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | SrcImmU16, em_ret_far_imm), + I(ImplicitOps, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ @@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), @@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I #undef GP #undef EXT +#undef MD +#undef ID #undef D2bv #undef D2bvIP @@ -4563,6 +4639,12 @@ done_prefixes: else opcode = opcode.u.idual->mod012; break; + case ModeDual: + if (ctxt->mode == X86EMUL_MODE_PROT64) + opcode = opcode.u.mdual->mode64; + else + opcode = opcode.u.mdual->mode32; + break; default: return EMULATION_FAILED; } @@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* optimisation - avoid slow emulated read if Mov */ rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + if (!(ctxt->d & NoWrite) && + rc == X86EMUL_PROPAGATE_FAULT && + ctxt->exception.vector == PF_VECTOR) + ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; + } } ctxt->dst.orig_val = ctxt->dst.val; @@ -4899,11 +4986,6 @@ special_insn: goto threebyte_insn; switch (ctxt->b) { - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - ctxt->dst.val = (s32) ctxt->src.val; - break; case 0x70 ... 
0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3c91955..c2e36d9 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 17b73ee..7dbced3 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) gfn += page_size >> PAGE_SHIFT; - + cond_resched(); } return 0; @@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm, kvm_unpin_pages(kvm, pfn, unmap_pages); gfn += unmap_pages; + + cond_resched(); } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d52dcf0..e55b5fc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,6 +33,7 @@ #include <asm/page.h> #include <asm/current.h> #include <asm/apicdef.h> +#include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include "kvm_cache_regs.h" @@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap) return count; } -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +void __kvm_apic_update_irr(u32 *pir, void *regs) { u32 i, pir_val; - struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i <= 7; i++) { pir_val = xchg(&pir[i], 0); if (pir_val) - *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; } } +EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); + +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + __kvm_apic_update_irr(pir, apic->regs); +} EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline void apic_set_irr(int vec, struct kvm_lapic *apic) @@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) { return dest == (apic_x2apic_mode(apic) ? 
X2APIC_BROADCAST : APIC_BROADCAST); } -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) { return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); } -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) +static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { - int result = 0; u32 logical_id; if (kvm_apic_broadcast(apic, mda)) - return 1; + return true; - if (apic_x2apic_mode(apic)) { - logical_id = kvm_apic_get_reg(apic, APIC_LDR); - return logical_id & mda; - } + logical_id = kvm_apic_get_reg(apic, APIC_LDR); - logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); + if (apic_x2apic_mode(apic)) + return ((logical_id >> 16) == (mda >> 16)) + && (logical_id & mda & 0xffff) != 0; + + logical_id = GET_APIC_LOGICAL_ID(logical_id); switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: - if (logical_id & mda) - result = 1; - break; + return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: - if (((logical_id >> 4) == (mda >> 0x4)) - && (logical_id & mda & 0xf)) - result = 1; - break; + return ((logical_id >> 4) == (mda >> 4)) + && (logical_id & mda & 0xf) != 0; default: apic_debug("Bad DFR vcpu %d: %08x\n", apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); - break; + return false; } - - return result; } -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { - int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " @@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) - /* Physical mode. */ - result = kvm_apic_match_physical_addr(target, dest); + if (dest_mode == APIC_DEST_PHYSICAL) + return kvm_apic_match_physical_addr(target, dest); else - /* Logical mode. */ - result = kvm_apic_match_logical_addr(target, dest); - break; + return kvm_apic_match_logical_addr(target, dest); case APIC_DEST_SELF: - result = (target == source); - break; + return target == source; case APIC_DEST_ALLINC: - result = 1; - break; + return true; case APIC_DEST_ALLBUT: - result = (target != source); - break; + return target != source; default: apic_debug("kvm: apic: Bad dest shorthand value %x\n", short_hand); - break; + return false; } - - return result; } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, @@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, ret = true; - if (irq->dest_mode == 0) { /* physical mode */ + if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) goto out; @@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; + struct kvm_timer *ktimer = &apic->lapic_timer; - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. 
- */ if (atomic_read(&apic->lapic_timer.pending)) return; atomic_inc(&apic->lapic_timer.pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_set_pending_timer(vcpu); if (waitqueue_active(q)) wake_up_interruptible(q); + + if (apic_lvtt_tscdeadline(apic)) + ktimer->expired_tscdeadline = ktimer->tscdeadline; +} + +/* + * On APICv, this test will cause a busy wait + * during a higher-priority task. + */ + +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + + if (kvm_apic_hw_enabled(apic)) { + int vec = reg & APIC_VECTOR_MASK; + void *bitmap = apic->regs + APIC_ISR; + + if (kvm_x86_ops->deliver_posted_interrupt) + bitmap = apic->regs + APIC_IRR; + + if (apic_test_vector(vec, bitmap)) + return true; + } + return false; +} + +void wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; + + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (apic->lapic_timer.expired_tscdeadline == 0) + return; + + if (!lapic_timer_int_injected(vcpu)) + return; + + tsc_deadline = apic->lapic_timer.expired_tscdeadline; + apic->lapic_timer.expired_tscdeadline = 0; + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); + + /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ + if (guest_tsc < tsc_deadline) + __delay(tsc_deadline - guest_tsc); } static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; + atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { @@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic) /* lapic timer in tsc deadline mode */ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; + ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; @@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); @@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, if (kvm_x86_ops->hwapic_irr_update) kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + if (unlikely(kvm_x86_ops->hwapic_isr_update)) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c674fce..0bc6c65 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -14,6 +14,7 @@ struct kvm_timer { u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; + u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ }; @@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); -int 
kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +void wait_lapic_expire(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c..cee7592 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -63,30 +63,16 @@ enum { #undef MMU_DEBUG #ifdef MMU_DEBUG +static bool dbg = 0; +module_param(dbg, bool, 0644); #define pgprintk(x...) do { if (dbg) printk(x); } while (0) #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - +#define MMU_WARN_ON(x) WARN_ON(x) #else - #define pgprintk(x...) do { } while (0) #define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } +#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 @@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } +static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) != (new_spte & bit_mask); +} + /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!shadow_accessed_mask) return ret; + /* + * Flush TLB when accessed/dirty bits are changed in the page tables, + * to guarantee consistency between TLB and page tables. 
+ */ + if (spte_is_bit_changed(old_spte, new_spte, + shadow_accessed_mask | shadow_dirty_mask)) + ret = true; + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) @@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, return flush; } +static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + + spte &= ~shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_clear_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + +static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + + spte |= shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_set_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. */ -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * @kvm: kvm instance + * @slot: slot to clear D-bit + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should clear D-bit + * + * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. + */ +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmapp); + + /* clear the first set bit */ + mask &= mask - 1; + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); + +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * PT level pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. 
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + if (kvm_x86_ops->enable_log_dirty_pt_masked) + kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, + mask); + else + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; @@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { - ASSERT(is_empty_shadow_page(sp->spt)); + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (pte_access & ACC_WRITE_MASK) + if (pte_access & ACC_WRITE_MASK) { mark_page_dirty(vcpu->kvm, gfn); + spte |= shadow_dirty_mask; + } set_pte: if (mmu_spte_update(sptep, spte)) @@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, */ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + /* + * Theoretically we could also set dirty bit (and flush TLB) here in + * order to eliminate unnecessary PML logging. See comments in + * set_spte. But fast_page_fault is very unlikely to happen with PML + * enabled, so we do not do this. This might result in the same GPA + * to be logged in PML buffer again when the write really happens, and + * eventually to be called by mark_page_dirty twice. But it's also no + * harm. This also avoids the TLB flush needed after setting dirty bit + * so non-PML cases won't be impacted. + * + * Compare with set_spte where instead shadow_dirty_mask is set. + */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) mark_page_dirty(vcpu->kvm, gfn); @@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), @@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); @@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { @@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, gpa, error_code, true); @@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, 
update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - ASSERT(is_pae(vcpu)); + MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; context->page_fault = tdp_page_fault; @@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_last_pte_bitmap(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else paging32_init_context(vcpu, context); - vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp + context->base_role.nxe = is_nx(vcpu); + context->base_role.cr4_pae = !!is_pae(vcpu); + context->base_role.cr0_wp = is_write_protection(vcpu); + context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + struct kvm_mmu *context = &vcpu->arch.mmu; + + kvm_init_shadow_mmu(vcpu); + context->set_cr3 = kvm_x86_ops->set_cr3; + context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; + context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) @@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu); else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu); else - return init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } @@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - ASSERT(vcpu); - /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 
* Therefore we need to allocate shadow page tables in the first @@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; @@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; + bool flush = false; - memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; spin_lock(&kvm->mmu_lock); @@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) - __rmap_write_protect(kvm, rmapp, false); + flush |= __rmap_write_protect(kvm, rmapp, + false); if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); @@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + unsigned long *rmapp; + unsigned long last_index, index; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_clear_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* + * It's also safe to flush TLBs out of mmu lock here as currently this + * function is only used for dirty logging, in which case flushing TLB + * out of mmu lock also guarantees no dirty pages will be lost in + * dirty_bitmap. 
+ */ + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); + +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_write_protect(kvm, rmapp, + false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); + + /* see kvm_mmu_slot_remove_write_access */ + lockdep_assert_held(&kvm->slots_lock); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); + +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_set_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* see kvm_mmu_slot_leaf_clear_dirty */ + if (flush) + kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) @@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bde8ee7..c7d6563 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,18 +44,6 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 - -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) - static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -81,9 +69,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41dd038..a17d848 
100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_mmu(vcpu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c2a34bb..7c7bc8b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc, #endif /* CONFIG_X86_64 */ +/* + * Tracepoint for PML full VMEXIT. + */ +TRACE_EVENT(kvm_pml_full, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %d: PML full", __entry->vcpu_id) +); + TRACE_EVENT(kvm_ple_window, TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), TP_ARGS(grow, vcpu_id, new, old), @@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update, __entry->flags) ); +TRACE_EVENT(kvm_wait_lapic_expire, + TP_PROTO(unsigned int vcpu_id, s64 delta), + TP_ARGS(vcpu_id, delta), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( s64, delta ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->delta = delta; + ), + + TP_printk("vcpu %u: delta %lld (%s)", + __entry->vcpu_id, + __entry->delta, + __entry->delta < 0 ? "early" : "late") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d8..3f73bfa 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> +#include <asm/apic.h> #include "trace.h" @@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO); static u64 __read_mostly host_xss; +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -215,7 +219,12 @@ struct __packed vmcs12 { u64 tsc_offset; u64 virtual_apic_page_addr; u64 apic_access_addr; + u64 posted_intr_desc_addr; u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; @@ -330,6 +339,7 @@ struct __packed vmcs12 { u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; + u16 posted_intr_nv; u16 guest_es_selector; u16 guest_cs_selector; u16 guest_ss_selector; @@ -338,6 +348,7 @@ struct __packed vmcs12 { u16 guest_gs_selector; u16 guest_ldtr_selector; u16 guest_tr_selector; + u16 guest_intr_status; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -401,6 +412,10 @@ struct nested_vmx { */ struct page *apic_access_page; struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -408,6 +423,23 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 
nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; }; #define POSTED_INTR_ON 0 @@ -511,6 +543,10 @@ struct vcpu_vmx { /* Dynamic PLE window. */ int ple_window; bool ple_window_dirty; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; }; enum segment_cache_field { @@ -594,6 +630,7 @@ static int max_shadow_read_write_fields = static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), FIELD(GUEST_SS_SELECTOR, guest_ss_selector), @@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_GS_SELECTOR, guest_gs_selector), FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(TSC_OFFSET, tsc_offset), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), @@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); +static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; @@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) SECONDARY_EXEC_SHADOW_VMCS; } +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1112,6 +1152,26 @@ 
static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). - * TODO: allow these variables to be modified (downgraded) by module options - * or other means. */ -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; -static u32 nested_vmx_true_procbased_ctls_low; -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; -static u32 nested_vmx_true_exit_ctls_low; -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; -static u32 nested_vmx_true_entry_ctls_low; -static u32 nested_vmx_misc_low, nested_vmx_misc_high; -static u32 nested_vmx_ept_caps; -static __init void nested_vmx_setup_ctls_msrs(void) +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); + vmx->nested.nested_vmx_pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_POSTED_INTR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); - nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); + vmx->nested.nested_vmx_exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_exit_ctls_high &= + vmx->nested.nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_exit_ctls_high |= 
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (vmx_mpx_supported()) - nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + vmx->nested.nested_vmx_true_exit_ctls_low = + vmx->nested.nested_vmx_exit_ctls_low & ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_entry_ctls_high &= + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); + vmx->nested.nested_vmx_entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | - VM_ENTRY_LOAD_IA32_EFER); + vmx->nested.nested_vmx_entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (vmx_mpx_supported()) - nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + vmx->nested.nested_vmx_true_entry_ctls_low = + vmx->nested.nested_vmx_entry_ctls_low & ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_procbased_ctls_high &= + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); + vmx->nested.nested_vmx_procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. 
*/ - nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_true_procbased_ctls_low = + vmx->nested.nested_vmx_procbased_ctls_low & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); - nested_vmx_secondary_ctls_low = 0; - nested_vmx_secondary_ctls_high &= + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); + vmx->nested.nested_vmx_secondary_ctls_low = 0; + vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST; - nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; - nested_vmx_ept_caps &= vmx_capability.ept; + vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific * for single context invalidation. Hence, only advertise * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else - nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_ept_caps = 0; /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + rdmsr(MSR_IA32_VMX_MISC, + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + vmx->nested.nested_vmx_misc_low |= + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - nested_vmx_misc_high = 0; + vmx->nested.nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high) /* Returns 0 on success, non-0 otherwise. 
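/*
 * Editor's sketch (illustrative, not part of this patch): the check that
 * vmx_control_verify() performs on a control value requested by L1, written
 * out explicitly. A value is acceptable iff every must-be-1 bit (low half of
 * the capability MSR) is set and no bit outside the may-be-1 mask (high
 * half) is set; this is equivalent to the kernel's one-line form.
 */
#include <stdbool.h>
#include <stdint.h>

static bool control_value_ok(uint32_t control, uint32_t low, uint32_t high)
{
	return (control & low) == low && (control & ~high) == 0;
}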
*/ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + switch (msr_index) { case MSR_IA32_VMX_BASIC: /* @@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, - nested_vmx_pinbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr(nested_vmx_misc_low, - nested_vmx_misc_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, - nested_vmx_secondary_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps; break; default: return 1; @@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, } } +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. 
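/*
 * Editor's sketch (illustrative, not part of this patch): the MSR-bitmap
 * layout the function below assumes, per the SDM's MSR-bitmap description.
 * One 4K page holds four 0x400-byte bitmaps: read-low at offset 0x000,
 * read-high at 0x400, write-low at 0x800 and write-high at 0xc00, where
 * "low" covers MSRs 0x00000000-0x00001fff and "high" covers
 * 0xc0000000-0xc0001fff.
 */
#include <stdint.h>

static unsigned long msr_bitmap_bit(uint32_t msr, int write)
{
	unsigned long bit = write ? 0x800 * 8 : 0;	/* write vs. read half */

	if (msr >= 0xc0000000)
		bit += 0x400 * 8;			/* high-MSR bitmap */
	return bit + (msr & 0x1fff);			/* bit index in the page */
}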
+ */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) { + WARN_ON(1); + return; + } + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) @@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (vmx->nested.pi_desc && + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return 0; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) + return 0; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { + WARN_ON(1); + return -ENOMEM; + } + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + return 0; +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* the PIR and ON have been set by L1. */ + if (vcpu->mode == IN_GUEST_MODE) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. + */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + return -1; +} /* * Send interrupt to vcpu via posted interrupt way. * 1. 
If target vcpu is running (non-root mode), send posted interrupt @@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) struct vcpu_vmx *vmx = to_vmx(vcpu); int r; + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return; + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; @@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* PML is enabled/disabled in creating/destroying vcpu */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + return exec_control; } @@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long always_on = VMXON_CR0_ALWAYSON; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) always_on &= ~(X86_CR0_PE | X86_CR0_PG); @@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vmcs12, val)) + if (!nested_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5817,13 +6016,21 @@ static __init int hardware_setup(void) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out4; + + if (nested) { + vmx_msr_bitmap_nested = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_nested) + goto out5; + } + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out5; + goto out6; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out6; + goto out7; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -5839,10 +6046,12 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + if (nested) + memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out7; + goto out8; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -5868,16 +6077,16 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) { + if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if not + using the APIC_ACCESS_ADDR VMCS field.
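/*
 * Editor's sketch (illustrative, not part of this patch): the
 * GUEST_INTR_STATUS update done in vmx_complete_nested_posted_interrupt()
 * above. The 16-bit field holds RVI (requesting virtual interrupt) in its
 * low byte and SVI (servicing virtual interrupt) in its high byte; the
 * highest pending PIR vector only replaces RVI when it is larger.
 */
#include <stdint.h>

static uint16_t bump_rvi(uint16_t guest_intr_status, uint8_t max_irr)
{
	if (max_irr > (guest_intr_status & 0xff))
		guest_intr_status = (guest_intr_status & 0xff00) | max_irr;
	return guest_intr_status;
}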
+ */ + if (!flexpriority_enabled) kvm_x86_ops->set_apic_access_page_addr = NULL; - } if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; @@ -5895,13 +6104,11 @@ static __init int hardware_setup(void) kvm_x86_ops->update_cr8_intercept = NULL; else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } - if (nested) - nested_vmx_setup_ctls_msrs(); - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -5945,12 +6152,29 @@ static __init int hardware_setup(void) update_ple_window_actual_max(); + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + return alloc_kvm_area(); -out7: +out8: free_page((unsigned long)vmx_vmwrite_bitmap); -out6: +out7: free_page((unsigned long)vmx_vmread_bitmap); +out6: + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: @@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. 
*/ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); +} + static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = @@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); vmcs_write64(VMCS_LINK_POINTER, -1ull); } + vmx->nested.posted_intr_nv = -1; kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } nested_free_all_saved_vmcss(vmx); } @@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type; gva_t gva; @@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, @@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; } +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_invvpid, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, }; static const int kvm_vmx_max_exit_handlers = @@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* apic_write and eoi_induced should exit unconditionally. */ + return 1; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is @@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static int vmx_enable_pml(struct vcpu_vmx *vmx) +{ + struct page *pml_pg; + u32 exec_control; + + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!pml_pg) + return -ENOMEM; + + vmx->pml_pg = pml_pg; + + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + + return 0; +} + +static void vmx_disable_pml(struct vcpu_vmx *vmx) +{ + u32 exec_control; + + ASSERT(vmx->pml_pg); + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +} + +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entry */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick each vcpu out of guest mode here, as the PML + * buffer is flushed at the beginning of every VMEXIT; only vcpus + * running in guest mode can have unflushed GPAs in their PML buffers. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + /* + * Flush the logged GPAs out of the PML buffer to keep dirty_bitmap up + * to date. Another benefit: in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode, since once a vcpu is back in root mode its PML buffer must + * have been flushed already.
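/*
 * Editor's sketch (illustrative, not part of this patch): the PML index
 * convention handle_pml_full() and vmx_flush_pml_buffer() above rely on.
 * GUEST_PML_INDEX is reset to PML_ENTITY_NUM - 1 (511) and the CPU
 * decrements it after logging each GPA, so it underflows once all 512
 * entries are used and the next logged write triggers the PML-full exit.
 */
#include <stdint.h>

static unsigned int pml_valid_entries(uint16_t pml_idx)
{
	if (pml_idx == 511)
		return 0;	/* still at the reset value: buffer empty */
	if (pml_idx >= 512)
		return 512;	/* index underflowed: buffer completely full */
	return 511 - pml_idx;	/* entries pml_idx+1 .. 511 hold logged GPAs */
}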
+ */ + if (enable_pml) + vmx_flush_pml_buffer(vmx); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) u16 status; u8 old; - if (!vmx_vm_has_apicv(kvm)) - return; - if (isr == -1) isr = 0; @@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_pml) + vmx_disable_pml(vmx); free_vpid(vmx); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + if (nested) + nested_vmx_setup_ctls_msrs(vmx); + + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + err = vmx_enable_pml(vmx); + if (err) + goto free_vmcs; + } + return &vmx->vcpu; free_vmcs: @@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, - nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.nested_vmx_ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. 
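/*
 * Editor's sketch (illustrative, not part of this patch): the
 * PFEC_MATCH/PFEC_MASK rule that nested_vmx_is_page_fault_vmexit()
 * implements, per the SDM. With mask == match == 0 every error code
 * "matches", so the decision degenerates to the plain exception-bitmap
 * test that the pre-patch code used.
 */
#include <stdbool.h>
#include <stdint.h>

static bool pf_causes_vmexit(uint32_t error_code, uint32_t pfec_mask,
			     uint32_t pfec_match, bool eb_pf_set)
{
	bool matches = (error_code & pfec_mask) == pfec_match;

	/* a #PF exits to L1 exactly when EB.PF and the match test agree */
	return eb_pf_set ? matches : !matches;
}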
*/ - if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, return false; } + if (nested_cpu_has_posted_intr(vmcs12)) { + if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) + return false; + + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + } + vmx->nested.pi_desc_page = + nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); + if (!vmx->nested.pi_desc_page) + return false; + + vmx->nested.pi_desc = + (struct pi_desc *)kmap(vmx->nested.pi_desc_page); + if (!vmx->nested.pi_desc) { + nested_release_page_clean(vmx->nested.pi_desc_page); + return false; + } + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + return true; } @@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + u64 addr; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { + WARN_ON(1); + return -EINVAL; + } + maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || + ((addr + PAGE_SIZE) >> maxphyaddr)) + return -EINVAL; + + return 0; +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!page) { + WARN_ON(1); + return false; + } + msr_bitmap = (unsigned long *)kmap(page); + if (!msr_bitmap) { + nested_release_page_clean(page); + WARN_ON(1); + return false; + } + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) + for (msr = 0x800; msr <= 0x8ff; msr++) + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + msr, MSR_TYPE_R); + /* TPR is allowed */ + nested_vmx_disable_intercept_for_msr(msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { + /* EOI and self-IPI are allowed */ + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + } else { + /* + * Enable reading intercept of all the x2apic + * MSRs. We should not rely on vmcs12 to do any + * optimizations here, it may have been modified + * by L1. 
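/*
 * Editor's sketch (illustrative, not part of this patch): the common shape
 * of the address checks applied in nested_get_vmcs12_pages() and
 * nested_vmx_check_msr_bitmap_controls() above -- the posted-interrupt
 * descriptor must be 64-byte aligned and the MSR bitmap page-aligned, and
 * either address must fit within the guest's physical-address width as
 * reported by cpuid_maxphyaddr(). alignment is assumed to be a power of two.
 */
#include <stdbool.h>
#include <stdint.h>

static bool nested_gpa_ok(uint64_t gpa, unsigned int maxphyaddr,
			  uint64_t alignment)
{
	return (gpa & (alignment - 1)) == 0 && (gpa >> maxphyaddr) == 0;
}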
+ */ + for (msr = 0x800; msr <= 0x8ff; msr++) + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + msr, + MSR_TYPE_R); + + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + kunmap(page); + nested_release_page_clean(page); + + return true; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + vmcs12->posted_intr_nv & 0xff00)) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field, + int maxphyaddr) +{ + u64 count, addr; + + if (vmcs12_read_any(vcpu, count_field, &count) || + vmcs12_read_any(vcpu, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_warn_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + return -EINVAL; + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not 
supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + if (kvm_get_msr(vcpu, e.index, &e.value)) { + pr_warn_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &e.value, sizeof(e.value))) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + return -EINVAL; + } + } + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | - PIN_BASED_POSTED_INTR); + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + if (nested_cpu_has_posted_intr(vmcs12)) { + /* + * Note that we use L0's vector here and in + * vmx_deliver_nested_posted_interrupt. 
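/*
 * Editor's sketch (illustrative, not part of this patch): the 16-byte
 * element of the VM-entry/VM-exit MSR-load and MSR-store areas walked by
 * nested_vmx_load_msr() and nested_vmx_store_msr() above; this mirrors the
 * kernel's struct vmx_msr_entry and the SDM format. The area itself must be
 * 16-byte aligned, and a failed load is reported as the 1-based index of
 * the offending entry (hence the "return i + 1").
 */
#include <stdint.h>

struct msr_area_entry {
	uint32_t index;		/* MSR number */
	uint32_t reserved;	/* must be zero, checked above */
	uint64_t value;		/* value to load, or slot for the stored value */
};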
+ */ + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } else + exec_control &= ~PIN_BASED_POSTED_INTR; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); - } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); } + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + vmcs_write64(EOI_EXIT_BITMAP0, + vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, + vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, + vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, + vmcs12->eoi_exit_bitmap3); + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } + if (cpu_has_vmx_msr_bitmap() && + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); + } else + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + /* - * Merging of IO and MSR bitmaps not currently supported. + * Merging of IO bitmap not currently supported. * Rather, exit every time. 
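/*
 * Editor's sketch (illustrative, not part of this patch): where a vector
 * lands in the EOI-exit bitmap copied from vmcs12 above. The 256 interrupt
 * vectors are spread across four 64-bit VMCS fields (EOI_EXIT_BITMAP0..3);
 * a set bit makes the virtualized EOI for that vector cause an EOI-induced
 * exit instead of completing in hardware.
 */
#include <stdint.h>

static void set_eoi_exit_bit(uint64_t bitmap[4], uint8_t vector)
{
	bitmap[vector / 64] |= 1ULL << (vector % 64);
}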
*/ - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; @@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) int cpu; struct loaded_vmcs *vmcs02; bool ia32e; + u32 msr_entry_idx; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !PAGE_ALIGNED(vmcs12->msr_bitmap)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - /*TODO: Also verify bits beyond physical address width are 0*/ + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high) || + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high) || + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high)) + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || + if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - vmcs12->launch_state = 1; - prepare_vmcs02(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return 
kvm_emulate_halt(vcpu); @@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; } - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); @@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* @@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } /* * We are now running in L2, mmu_notifier will force to reload the @@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_nested_events = vmx_check_nested_events, .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c259814..bd7a70b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +/* lapic timer 
advance (tscdeadline mode only) in nanoseconds */ +unsigned int lapic_timer_advance_ns = 0; +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); + static bool backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_window", VCPU_STAT(irq_window_exits) }, { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, +static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) { return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, @@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk) } #endif +void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +{ + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. This function is only called from + * the physical CPU that is running vcpu. + */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +} static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; +static unsigned long max_tsc_khz; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { @@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; @@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed; + && !backwards_tsc_observed + && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { u64 gpa_offset; + struct kvm_arch *ka = &vcpu->kvm->arch; + kvmclock_reset(vcpu); + if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { + bool tmp = (msr == MSR_KVM_SYSTEM_TIME); + + if (ka->boot_vcpu_runs_old_kvmclock != tmp) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); + + ka->boot_vcpu_runs_old_kvmclock = tmp; + } + vcpu->arch.time = data; kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); @@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +EXPORT_SYMBOL_GPL(kvm_get_msr); static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { @@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case 
KVM_CAP_IOAPIC_POLARITY_IGNORED: + case KVM_CAP_TSC_DEADLINE_TIMER: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; default: r = 0; break; @@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, * @kvm: kvm instance * @log: slot id and address to which we copy the log * - * We need to keep it in mind that VCPU threads can write to the bitmap - * concurrently. So, to avoid losing data, we keep the following order for - * each bit: + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. - * 3. Flush TLB's if needed. - * 4. Copy the snapshot to the userspace. - * - * Between 2 and 3, the guest may write to the page using the remaining TLB - * entry. This is not a problem because the page will be reported dirty at - * step 4 using the snapshot taken before and step 3 ensures that successive - * writes will be logged for the next call. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - struct kvm_memory_slot *memslot; - unsigned long n, i; - unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_buffer; bool is_dirty = false; + int r; mutex_lock(&kvm->slots_lock); - r = -EINVAL; - if (log->slot >= KVM_USER_MEM_SLOTS) - goto out; - - memslot = id_to_memslot(kvm->memslots, log->slot); - - dirty_bitmap = memslot->dirty_bitmap; - r = -ENOENT; - if (!dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); - - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); - memset(dirty_bitmap_buffer, 0, n); - - spin_lock(&kvm->mmu_lock); - - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; - - if (!dirty_bitmap[i]) - continue; - - is_dirty = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; - - offset = i * BITS_PER_LONG; - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); - } - - spin_unlock(&kvm->mmu_lock); + /* + * Flush potentially hardware-cached dirty pages to dirty_bitmap. + */ + if (kvm_x86_ops->flush_log_dirty) + kvm_x86_ops->flush_log_dirty(kvm); - /* See the comments in kvm_mmu_slot_remove_write_access(). */ - lockdep_assert_held(&kvm->slots_lock); + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); /* * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). 
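/*
 * Editor's sketch (illustrative, not part of this patch): how userspace
 * drives the dirty-log path above via the standard KVM_GET_DIRTY_LOG ioctl.
 * The bitmap needs one bit per page in the slot, rounded up to a multiple
 * of the host long size (64-bit assumed here); vm_fd and slot_npages are
 * assumed to come from the caller's existing VM setup.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int get_dirty_log(int vm_fd, uint32_t slot, uint64_t slot_npages,
			 unsigned long **bitmap_out)
{
	size_t len = ((slot_npages + 63) / 64) * 8;	/* one bit per page */
	unsigned long *bitmap = calloc(1, len);
	struct kvm_dirty_log log = { .slot = slot };

	if (!bitmap)
		return -1;
	log.dirty_bitmap = bitmap;
	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
		free(bitmap);
		return -1;
	}
	*bitmap_out = bitmap;	/* caller frees after scanning dirty bits */
	return 0;
}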
*/ + lockdep_assert_held(&kvm->slots_lock); if (is_dirty) kvm_flush_remote_tlbs(kvm); - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - goto out; - - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; } @@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, if (rc != X86EMUL_CONTINUE) return rc; addr += now; + if (ctxt->mode != X86EMUL_MODE_PROT64) + addr = (u32)addr; val += now; bytes -= now; } @@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon kvm_register_write(emul_to_vcpu(ctxt), reg, val); } +static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) +{ + kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = { .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .set_nmi_mask = emulator_set_nmi_mask, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* @@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - int r; struct msr_data msr; struct kvm *kvm = vcpu->kvm; - r = vcpu_load(vcpu); - if (r) - return r; + if (vcpu_load(vcpu)) + return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; @@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); - - return r; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } +static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Still write protect RO slot */ + if (new->flags & KVM_MEM_READONLY) { + kvm_mmu_slot_remove_write_access(kvm, new); + return; + } + + /* + * Call kvm_x86_ops dirty logging hooks when they are valid. + * + * kvm_x86_ops->slot_disable_log_dirty is called when: + * + * - KVM_MR_CREATE with dirty logging disabled + * - KVM_MR_FLAGS_ONLY with dirty logging disabled in the new flags + * + * The reason is, in case of PML, we need to set the D-bit for any slots + * with dirty logging disabled in order to eliminate unnecessary GPA + * logging in the PML buffer (and potential PML-buffer-full VMEXIT). This + * guarantees leaving PML enabled during guest's lifetime won't have + * any additional overhead from PML when guest is running with dirty + * logging disabled for memory slots. + * + * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot + * to dirty logging mode. + * + * If kvm_x86_ops dirty logging hooks are invalid, use write protect. + * + * In case of write protect: + * + * Write protect all pages for dirty logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We cannot create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault().
+ */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (kvm_x86_ops->slot_enable_log_dirty) + kvm_x86_ops->slot_enable_log_dirty(kvm, new); + else + kvm_mmu_slot_remove_write_access(kvm, new); + } else { + if (kvm_x86_ops->slot_disable_log_dirty) + kvm_x86_ops->slot_disable_log_dirty(kvm, new); + } +} + void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, enum kvm_mr_change change) { - + struct kvm_memory_slot *new; int nr_mmu_pages = 0; if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { @@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + + /* It's OK to get 'new' slot here as it has already been installed */ + new = id_to_memslot(kvm->memslots, mem->slot); + /* - * Write protect all pages for dirty logging. + * Set up write protection and/or dirty logging for the new slot. * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. - * - * See the comments in fast_page_fault(). + * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have + * been zapped so no dirty logging work is needed for the old slot. For + * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the + * new one, and it is also covered when dealing with the new slot. */ - if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_slot_remove_write_access(kvm, mem->slot); + if (change != KVM_MR_DELETE) + kvm_mmu_slot_apply_flags(kvm, new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cc1d61a..f5fef18 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; +extern unsigned int lapic_timer_advance_ns; + extern struct static_key kvm_no_apic_vcpu; #endif
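/*
 * Editor's sketch (illustrative, not part of this patch): the userspace side
 * of the KVM_MR_FLAGS_ONLY transition handled by kvm_mmu_slot_apply_flags()
 * above. Re-registering an existing memslot with KVM_MEM_LOG_DIRTY_PAGES
 * turns dirty logging (and, where supported, PML-based tracking) on for that
 * slot. vm_fd, slot, guest_phys, size and host_mem are assumed to describe a
 * slot the caller already created.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int enable_dirty_logging(int vm_fd, uint32_t slot, uint64_t guest_phys,
				uint64_t size, void *host_mem)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* only the flags change */
		.guest_phys_addr = guest_phys,
		.memory_size = size,
		.userspace_addr = (uint64_t)(uintptr_t)host_mem,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}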