diff options
133 files changed, 5170 insertions, 2239 deletions
diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX new file mode 100644 index 0000000..641ec92 --- /dev/null +++ b/Documentation/virtual/kvm/00-INDEX @@ -0,0 +1,24 @@ +00-INDEX + - this file. +api.txt + - KVM userspace API. +cpuid.txt + - KVM-specific cpuid leaves (x86). +devices/ + - KVM_CAP_DEVICE_CTRL userspace API. +hypercalls.txt + - KVM hypercalls. +locking.txt + - notes on KVM locks. +mmu.txt + - the x86 kvm shadow mmu. +msr.txt + - KVM-specific MSRs (x86). +nested-vmx.txt + - notes on nested virtualization for Intel x86 processors. +ppc-pv.txt + - the paravirtualization interface on PowerPC. +review-checklist.txt + - review checklist for KVM patches. +timekeeping.txt + - timekeeping virtualization for x86-based architectures. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 858aecf..a30035d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1122,9 +1122,9 @@ struct kvm_cpuid2 { struct kvm_cpuid_entry2 entries[0]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) struct kvm_cpuid_entry2 { __u32 function; @@ -1810,6 +1810,50 @@ registers, find a list below: PPC | KVM_REG_PPC_TLB3PS | 32 PPC | KVM_REG_PPC_EPTCFG | 32 PPC | KVM_REG_PPC_ICP_STATE | 64 + PPC | KVM_REG_PPC_TB_OFFSET | 64 + PPC | KVM_REG_PPC_SPMC1 | 32 + PPC | KVM_REG_PPC_SPMC2 | 32 + PPC | KVM_REG_PPC_IAMR | 64 + PPC | KVM_REG_PPC_TFHAR | 64 + PPC | KVM_REG_PPC_TFIAR | 64 + PPC | KVM_REG_PPC_TEXASR | 64 + PPC | KVM_REG_PPC_FSCR | 64 + PPC | KVM_REG_PPC_PSPB | 32 + PPC | KVM_REG_PPC_EBBHR | 64 + PPC | KVM_REG_PPC_EBBRR | 64 + PPC | KVM_REG_PPC_BESCR | 64 + PPC | KVM_REG_PPC_TAR | 64 + PPC | KVM_REG_PPC_DPDES | 64 + PPC | KVM_REG_PPC_DAWR | 64 + PPC | KVM_REG_PPC_DAWRX | 64 + PPC | KVM_REG_PPC_CIABR | 64 + PPC | KVM_REG_PPC_IC | 64 + PPC | KVM_REG_PPC_VTB | 64 + PPC | KVM_REG_PPC_CSIGR | 64 + PPC | KVM_REG_PPC_TACR | 64 + PPC | KVM_REG_PPC_TCSCR | 64 + PPC | KVM_REG_PPC_PID | 64 + PPC | KVM_REG_PPC_ACOP | 64 + PPC | KVM_REG_PPC_VRSAVE | 32 + PPC | KVM_REG_PPC_LPCR | 64 + PPC | KVM_REG_PPC_PPR | 64 + PPC | KVM_REG_PPC_ARCH_COMPAT 32 + PPC | KVM_REG_PPC_TM_GPR0 | 64 + ... + PPC | KVM_REG_PPC_TM_GPR31 | 64 + PPC | KVM_REG_PPC_TM_VSR0 | 128 + ... + PPC | KVM_REG_PPC_TM_VSR63 | 128 + PPC | KVM_REG_PPC_TM_CR | 64 + PPC | KVM_REG_PPC_TM_LR | 64 + PPC | KVM_REG_PPC_TM_CTR | 64 + PPC | KVM_REG_PPC_TM_FPSCR | 64 + PPC | KVM_REG_PPC_TM_AMR | 64 + PPC | KVM_REG_PPC_TM_PPR | 64 + PPC | KVM_REG_PPC_TM_VRSAVE | 64 + PPC | KVM_REG_PPC_TM_VSCR | 32 + PPC | KVM_REG_PPC_TM_DSCR | 64 + PPC | KVM_REG_PPC_TM_TAR | 64 ARM registers are mapped using the lower 32 bits. The upper 16 of that is the register group type, or coprocessor number: @@ -2304,7 +2348,31 @@ Possible features: Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). -4.83 KVM_GET_REG_LIST +4.83 KVM_ARM_PREFERRED_TARGET + +Capability: basic +Architectures: arm, arm64 +Type: vm ioctl +Parameters: struct struct kvm_vcpu_init (out) +Returns: 0 on success; -1 on error +Errors: + ENODEV: no preferred target available for the host + +This queries KVM for preferred CPU target type which can be emulated +by KVM on underlying host. + +The ioctl returns struct kvm_vcpu_init instance containing information +about preferred CPU target type and recommended features for it. The +kvm_vcpu_init->features bitmap returned will have feature bits set if +the preferred target recommends setting these features, but this is +not mandatory. + +The information returned by this ioctl can be used to prepare an instance +of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in +in VCPU matching underlying host. + + +4.84 KVM_GET_REG_LIST Capability: basic Architectures: arm, arm64 @@ -2323,8 +2391,7 @@ struct kvm_reg_list { This ioctl returns the guest registers that are supported for the KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. - -4.84 KVM_ARM_SET_DEVICE_ADDR +4.85 KVM_ARM_SET_DEVICE_ADDR Capability: KVM_CAP_ARM_SET_DEVICE_ADDR Architectures: arm, arm64 @@ -2362,7 +2429,7 @@ must be called after calling KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the base addresses will return -EEXIST. -4.85 KVM_PPC_RTAS_DEFINE_TOKEN +4.86 KVM_PPC_RTAS_DEFINE_TOKEN Capability: KVM_CAP_PPC_RTAS Architectures: ppc @@ -2661,6 +2728,77 @@ and usually define the validity of a groups of registers. (e.g. one bit }; +4.81 KVM_GET_EMULATED_CPUID + +Capability: KVM_CAP_EXT_EMUL_CPUID +Architectures: x86 +Type: system ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 flags; + struct kvm_cpuid_entry2 entries[0]; +}; + +The member 'flags' is used for passing flags from userspace. + +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features which are emulated by +kvm.Userspace can use the information returned by this ioctl to query +which features are emulated by kvm instead of being present natively. + +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 +structure with the 'nent' field indicating the number of entries in +the variable-size array 'entries'. If the number of entries is too low +to describe the cpu capabilities, an error (E2BIG) is returned. If the +number is too high, the 'nent' field is adjusted and an error (ENOMEM) +is returned. If the number is just right, the 'nent' field is adjusted +to the number of valid entries in the 'entries' array, which is then +filled. + +The entries returned are the set CPUID bits of the respective features +which kvm emulates, as returned by the CPUID instruction, with unknown +or unsupported feature bits cleared. + +Features like x2apic, for example, may not be present in the host cpu +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be +emulated efficiently and thus not included here. + +The fields in each entry are defined as follows: + + function: the eax value used to obtain the entry + index: the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: an OR of zero or more of the following: + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + eax, ebx, ecx, edx: the values returned by the cpuid instruction for + this function/index combination + + 6. Capabilities that can be enabled ----------------------------------- diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 22ff659..3c65feb 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -43,6 +43,13 @@ KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by || || writing to msr 0x4b564d02 ------------------------------------------------------------------------------ +KVM_FEATURE_STEAL_TIME || 5 || steal time can be enabled by + || || writing to msr 0x4b564d03. +------------------------------------------------------------------------------ +KVM_FEATURE_PV_EOI || 6 || paravirtualized end of interrupt + || || handler can be enabled by writing + || || to msr 0x4b564d04. +------------------------------------------------------------------------------ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt new file mode 100644 index 0000000..ef51740 --- /dev/null +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -0,0 +1,22 @@ +VFIO virtual device +=================== + +Device types supported: + KVM_DEV_TYPE_VFIO + +Only one VFIO instance may be created per VM. The created device +tracks VFIO groups in use by the VM and features of those groups +important to the correctness and acceleration of the VM. As groups +are enabled and disabled for use by the VM, KVM should be updated +about their presence. When registered with KVM, a reference to the +VFIO-group is held by KVM. + +Groups: + KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_GROUP attributes: + KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + +For each, kvm_device_attr.addr points to an int32_t file descriptor +for the VFIO group. diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 41b7ac9..f886941 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -132,10 +132,14 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update(). ------------ Name: kvm_lock -Type: raw_spinlock +Type: spinlock_t Arch: any Protects: - vm_list - - hardware virtualization enable/disable + +Name: kvm_count_lock +Type: raw_spinlock_t +Arch: any +Protects: - hardware virtualization enable/disable Comment: 'raw' because hardware enabling/disabling must be atomic /wrt migration. @@ -151,3 +155,14 @@ Type: spinlock_t Arch: any Protects: -shadow page/shadow tlb entry Comment: it is a spinlock since it is used in mmu notifier. + +Name: kvm->srcu +Type: srcu lock +Arch: any +Protects: - kvm->memslots + - kvm->buses +Comment: The srcu read lock must be held while accessing memslots (e.g. + when using gfn_to_* functions) and while accessing in-kernel + MMIO/PIO address->device structure mapping (kvm->buses). + The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu + if it is needed by multiple functions. diff --git a/MAINTAINERS b/MAINTAINERS index f3ef1d1..583af4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4871,7 +4871,8 @@ KERNEL VIRTUAL MACHINE (KVM) M: Gleb Natapov <gleb@redhat.com> M: Paolo Bonzini <pbonzini@redhat.com> L: kvm@vger.kernel.org -W: http://linux-kvm.org +W: http://www.linux-kvm.org +T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git S: Supported F: Documentation/*/kvm*.txt F: Documentation/virtual/kvm/ diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 64e9696..1d3153c 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -57,6 +57,7 @@ * TSC: Trap SMC * TSW: Trap cache operations by set/way * TWI: Trap WFI + * TWE: Trap WFE * TIDCP: Trap L2CTLR/L2ECTLR * BSU_IS: Upgrade barriers to the inner shareable domain * FB: Force broadcast of all maintainance operations @@ -67,7 +68,7 @@ */ #define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ - HCR_SWIO | HCR_TIDCP) + HCR_TWE | HCR_SWIO | HCR_TIDCP) #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) /* System Control Register (SCTLR) bits */ @@ -95,12 +96,12 @@ #define TTBCR_IRGN1 (3 << 24) #define TTBCR_EPD1 (1 << 23) #define TTBCR_A1 (1 << 22) -#define TTBCR_T1SZ (3 << 16) +#define TTBCR_T1SZ (7 << 16) #define TTBCR_SH0 (3 << 12) #define TTBCR_ORGN0 (3 << 10) #define TTBCR_IRGN0 (3 << 8) #define TTBCR_EPD0 (1 << 7) -#define TTBCR_T0SZ 3 +#define TTBCR_T0SZ (7 << 0) #define HTCR_MASK (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0) /* Hyp System Trap Register */ @@ -208,6 +209,8 @@ #define HSR_EC_DABT (0x24) #define HSR_EC_DABT_HYP (0x25) +#define HSR_WFI_IS_WFE (1U << 0) + #define HSR_HVC_IMM_MASK ((1UL << 16) - 1) #define HSR_DABT_S1PTW (1U << 7) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index a2f43dd..661da11 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -39,7 +39,7 @@ #define c6_IFAR 17 /* Instruction Fault Address Register */ #define c7_PAR 18 /* Physical Address Register */ #define c7_PAR_high 19 /* PAR top 32 bits */ -#define c9_L2CTLR 20 /* Cortex A15 L2 Control Register */ +#define c9_L2CTLR 20 /* Cortex A15/A7 L2 Control Register */ #define c10_PRRR 21 /* Primary Region Remap Register */ #define c10_NMRR 22 /* Normal Memory Remap Register */ #define c12_VBAR 23 /* Vector Base Address Register */ diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index a464e8d7..0fa90c9 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -157,4 +157,55 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; } +static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cp15[c0_MPIDR]; +} + +static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) +{ + *vcpu_cpsr(vcpu) |= PSR_E_BIT; +} + +static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) +{ + return !!(*vcpu_cpsr(vcpu) & PSR_E_BIT); +} + +static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return be16_to_cpu(data & 0xffff); + default: + return be32_to_cpu(data); + } + } + + return data; /* Leave LE untouched */ +} + +static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return cpu_to_be16(data & 0xffff); + default: + return cpu_to_be32(data); + } + } + + return data; /* Leave LE untouched */ +} + #endif /* __ARM_KVM_EMULATE_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 7d22517..8a6f6db 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -38,11 +38,6 @@ #define KVM_VCPU_MAX_FEATURES 1 -/* We don't currently support large pages. */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) - #include <kvm/arm_vgic.h> struct kvm_vcpu; @@ -154,6 +149,7 @@ struct kvm_vcpu_stat { struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); struct kvm_one_reg; diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 9b28c41..77de4a4 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -62,6 +62,12 @@ phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); +static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd) +{ + *pmd = new_pmd; + flush_pmd_entry(pmd); +} + static inline void kvm_set_pte(pte_t *pte, pte_t new_pte) { *pte = new_pte; @@ -103,9 +109,15 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= L_PTE_S2_RDWR; } +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +{ + pmd_val(*pmd) |= L_PMD_S2_RDWR; +} + struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, + unsigned long size) { /* * If we are going to insert an instruction page and the icache is @@ -120,8 +132,7 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) * need any kind of flushing (DDI 0406C.b - Page B3-1392). */ if (icache_is_pipt()) { - unsigned long hva = gfn_to_hva(kvm, gfn); - __cpuc_coherent_user_range(hva, hva + PAGE_SIZE); + __cpuc_coherent_user_range(hva, hva + size); } else if (!icache_is_vivt_asid_tagged()) { /* any kind of VIPT cache */ __flush_icache_all(); diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 39c54cf..4f95039 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -126,6 +126,8 @@ #define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ #define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ + /* * Hyp-mode PL2 PTE definitions for LPAE. */ diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index c1ee007..c498b60 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -63,7 +63,8 @@ struct kvm_regs { /* Supported Processor Types */ #define KVM_ARM_TARGET_CORTEX_A15 0 -#define KVM_ARM_NUM_TARGETS 1 +#define KVM_ARM_TARGET_CORTEX_A7 1 +#define KVM_ARM_NUM_TARGETS 2 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ #define KVM_ARM_DEVICE_TYPE_SHIFT 0 diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index ebf5015..466bd29 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST depends on ARM_VIRT_EXT && ARM_LPAE diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index d99bee4..789bca9 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -19,6 +19,6 @@ kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o -obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o +obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index aea7ccb..2a700e0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -152,12 +152,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } @@ -797,6 +798,19 @@ long kvm_arch_vm_ioctl(struct file *filp, return -EFAULT; return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); } + case KVM_ARM_PREFERRED_TARGET: { + int err; + struct kvm_vcpu_init init; + + err = kvm_vcpu_preferred_target(&init); + if (err) + return err; + + if (copy_to_user(argp, &init, sizeof(init))) + return -EFAULT; + + return 0; + } default: return -EINVAL; } diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index db9cf69..78c0885 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -71,6 +71,98 @@ int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) +{ + /* + * Compute guest MPIDR. We build a virtual cluster out of the + * vcpu_id, but we read the 'U' bit from the underlying + * hardware directly. + */ + vcpu->arch.cp15[c0_MPIDR] = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) | + ((vcpu->vcpu_id >> 2) << MPIDR_LEVEL_BITS) | + (vcpu->vcpu_id & 3)); +} + +/* TRM entries A7:4.3.31 A15:4.3.28 - RO WI */ +static bool access_actlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR]; + return true; +} + +/* TRM entries A7:4.3.56, A15:4.3.60 - R/O. */ +static bool access_cbar(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return write_to_read_only(vcpu, p); + return read_zero(vcpu, p); +} + +/* TRM entries A7:4.3.49, A15:4.3.48 - R/O WI */ +static bool access_l2ctlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR]; + return true; +} + +static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) +{ + u32 l2ctlr, ncores; + + asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr)); + l2ctlr &= ~(3 << 24); + ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1; + /* How many cores in the current cluster and the next ones */ + ncores -= (vcpu->vcpu_id & ~3); + /* Cap it to the maximum number of cores in a single cluster */ + ncores = min(ncores, 3U); + l2ctlr |= (ncores & 3) << 24; + + vcpu->arch.cp15[c9_L2CTLR] = l2ctlr; +} + +static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) +{ + u32 actlr; + + /* ACTLR contains SMP bit: make sure you create all cpus first! */ + asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); + /* Make the SMP bit consistent with the guest configuration */ + if (atomic_read(&vcpu->kvm->online_vcpus) > 1) + actlr |= 1U << 6; + else + actlr &= ~(1U << 6); + + vcpu->arch.cp15[c1_ACTLR] = actlr; +} + +/* + * TRM entries: A7:4.3.50, A15:4.3.49 + * R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored). + */ +static bool access_l2ectlr(struct kvm_vcpu *vcpu, + const struct coproc_params *p, + const struct coproc_reg *r) +{ + if (p->is_write) + return ignore_write(vcpu, p); + + *vcpu_reg(vcpu, p->Rt1) = 0; + return true; +} + /* See note at ARM ARM B1.14.4 */ static bool access_dcsw(struct kvm_vcpu *vcpu, const struct coproc_params *p, @@ -153,10 +245,22 @@ static bool pm_fake(struct kvm_vcpu *vcpu, * registers preceding 32-bit ones. */ static const struct coproc_reg cp15_regs[] = { + /* MPIDR: we use VMPIDR for guest access. */ + { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32, + NULL, reset_mpidr, c0_MPIDR }, + /* CSSELR: swapped by interrupt.S. */ { CRn( 0), CRm( 0), Op1( 2), Op2( 0), is32, NULL, reset_unknown, c0_CSSELR }, + /* ACTLR: trapped by HCR.TAC bit. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, + access_actlr, reset_actlr, c1_ACTLR }, + + /* CPACR: swapped by interrupt.S. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32, + NULL, reset_val, c1_CPACR, 0x00000000 }, + /* TTBR0/TTBR1: swapped by interrupt.S. */ { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 }, { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 }, @@ -195,6 +299,13 @@ static const struct coproc_reg cp15_regs[] = { { CRn( 7), CRm(10), Op1( 0), Op2( 2), is32, access_dcsw}, { CRn( 7), CRm(14), Op1( 0), Op2( 2), is32, access_dcsw}, /* + * L2CTLR access (guest wants to know #CPUs). + */ + { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, + access_l2ctlr, reset_l2ctlr, c9_L2CTLR }, + { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr}, + + /* * Dummy performance monitor implementation. */ { CRn( 9), CRm(12), Op1( 0), Op2( 0), is32, access_pmcr}, @@ -234,6 +345,9 @@ static const struct coproc_reg cp15_regs[] = { /* CNTKCTL: swapped by interrupt.S. */ { CRn(14), CRm( 1), Op1( 0), Op2( 0), is32, NULL, reset_val, c14_CNTKCTL, 0x00000000 }, + + /* The Configuration Base Address Register. */ + { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar}, }; /* Target specific emulation tables */ @@ -241,6 +355,12 @@ static struct kvm_coproc_target_table *target_tables[KVM_ARM_NUM_TARGETS]; void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table) { + unsigned int i; + + for (i = 1; i < table->num; i++) + BUG_ON(cmp_reg(&table->table[i-1], + &table->table[i]) >= 0); + target_tables[table->target] = table; } diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c index cf93472..bb0cac1 100644 --- a/arch/arm/kvm/coproc_a15.c +++ b/arch/arm/kvm/coproc_a15.c @@ -17,101 +17,12 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include <linux/kvm_host.h> -#include <asm/cputype.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_host.h> -#include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> +#include <asm/kvm_emulate.h> #include <linux/init.h> -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - /* - * Compute guest MPIDR: - * (Even if we present only one VCPU to the guest on an SMP - * host we don't set the U bit in the MPIDR, or vice versa, as - * revealing the underlying hardware properties is likely to - * be the best choice). - */ - vcpu->arch.cp15[c0_MPIDR] = (read_cpuid_mpidr() & ~MPIDR_LEVEL_MASK) - | (vcpu->vcpu_id & MPIDR_LEVEL_MASK); -} - #include "coproc.h" -/* A15 TRM 4.3.28: RO WI */ -static bool access_actlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR]; - return true; -} - -/* A15 TRM 4.3.60: R/O. */ -static bool access_cbar(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return write_to_read_only(vcpu, p); - return read_zero(vcpu, p); -} - -/* A15 TRM 4.3.48: R/O WI. */ -static bool access_l2ctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR]; - return true; -} - -static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - u32 l2ctlr, ncores; - - asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr)); - l2ctlr &= ~(3 << 24); - ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1; - l2ctlr |= (ncores & 3) << 24; - - vcpu->arch.cp15[c9_L2CTLR] = l2ctlr; -} - -static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - u32 actlr; - - /* ACTLR contains SMP bit: make sure you create all cpus first! */ - asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); - /* Make the SMP bit consistent with the guest configuration */ - if (atomic_read(&vcpu->kvm->online_vcpus) > 1) - actlr |= 1U << 6; - else - actlr &= ~(1U << 6); - - vcpu->arch.cp15[c1_ACTLR] = actlr; -} - -/* A15 TRM 4.3.49: R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored). */ -static bool access_l2ectlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = 0; - return true; -} - /* * A15-specific CP15 registers. * CRn denotes the primary register number, but is copied to the CRm in the @@ -121,29 +32,9 @@ static bool access_l2ectlr(struct kvm_vcpu *vcpu, * registers preceding 32-bit ones. */ static const struct coproc_reg a15_regs[] = { - /* MPIDR: we use VMPIDR for guest access. */ - { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32, - NULL, reset_mpidr, c0_MPIDR }, - /* SCTLR: swapped by interrupt.S. */ { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, NULL, reset_val, c1_SCTLR, 0x00C50078 }, - /* ACTLR: trapped by HCR.TAC bit. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, - access_actlr, reset_actlr, c1_ACTLR }, - /* CPACR: swapped by interrupt.S. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32, - NULL, reset_val, c1_CPACR, 0x00000000 }, - - /* - * L2CTLR access (guest wants to know #CPUs). - */ - { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, - access_l2ctlr, reset_l2ctlr, c9_L2CTLR }, - { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr}, - - /* The Configuration Base Address Register. */ - { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar}, }; static struct kvm_coproc_target_table a15_target_table = { @@ -154,12 +45,6 @@ static struct kvm_coproc_target_table a15_target_table = { static int __init coproc_a15_init(void) { - unsigned int i; - - for (i = 1; i < ARRAY_SIZE(a15_regs); i++) - BUG_ON(cmp_reg(&a15_regs[i-1], - &a15_regs[i]) >= 0); - kvm_register_target_coproc_table(&a15_target_table); return 0; } diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c new file mode 100644 index 0000000..1df7673 --- /dev/null +++ b/arch/arm/kvm/coproc_a7.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Copyright (C) 2013 - ARM Ltd + * + * Authors: Rusty Russell <rusty@rustcorp.au> + * Christoffer Dall <c.dall@virtualopensystems.com> + * Jonathan Austin <jonathan.austin@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include <linux/kvm_host.h> +#include <asm/kvm_coproc.h> +#include <asm/kvm_emulate.h> +#include <linux/init.h> + +#include "coproc.h" + +/* + * Cortex-A7 specific CP15 registers. + * CRn denotes the primary register number, but is copied to the CRm in the + * user space API for 64-bit register access in line with the terminology used + * in the ARM ARM. + * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit + * registers preceding 32-bit ones. + */ +static const struct coproc_reg a7_regs[] = { + /* SCTLR: swapped by interrupt.S. */ + { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, + NULL, reset_val, c1_SCTLR, 0x00C50878 }, +}; + +static struct kvm_coproc_target_table a7_target_table = { + .target = KVM_ARM_TARGET_CORTEX_A7, + .table = a7_regs, + .num = ARRAY_SIZE(a7_regs), +}; + +static int __init coproc_a7_init(void) +{ + kvm_register_target_coproc_table(&a7_target_table); + return 0; +} +late_initcall(coproc_a7_init); diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index bdede9e..d6c0052 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -354,7 +354,7 @@ static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr) *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; if (is_pabt) { - /* Set DFAR and DFSR */ + /* Set IFAR and IFSR */ vcpu->arch.cp15[c6_IFAR] = addr; is_lpae = (vcpu->arch.cp15[c2_TTBCR] >> 31); /* Always give debug fault for now - should give guest a clue */ diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 152d036..20f8d97 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -190,6 +190,8 @@ int __attribute_const__ kvm_target_cpu(void) return -EINVAL; switch (part_number) { + case ARM_CPU_PART_CORTEX_A7: + return KVM_ARM_TARGET_CORTEX_A7; case ARM_CPU_PART_CORTEX_A15: return KVM_ARM_TARGET_CORTEX_A15; default: @@ -202,7 +204,7 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, { unsigned int i; - /* We can only do a cortex A15 for now. */ + /* We can only cope with guest==host and only on A15/A7 (for now). */ if (init->target != kvm_target_cpu()) return -EINVAL; @@ -222,6 +224,26 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, return kvm_reset_vcpu(vcpu); } +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) +{ + int target = kvm_target_cpu(); + + if (target < 0) + return -ENODEV; + + memset(init, 0, sizeof(*init)); + + /* + * For now, we don't return any features. + * In future, we might use features to return target + * specific features available for the preferred + * target type. + */ + init->target = (__u32)target; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index df4c82d..a920790 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -73,23 +73,29 @@ static int handle_dabt_hyp(struct kvm_vcpu *vcpu, struct kvm_run *run) } /** - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest + * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests * @vcpu: the vcpu pointer * @run: the kvm_run structure pointer * - * Simply sets the wait_for_interrupts flag on the vcpu structure, which will - * halt execution of world-switches and schedule other host processes until - * there is an incoming IRQ or FIQ to the VM. + * WFE: Yield the CPU and come back to this vcpu when the scheduler + * decides to. + * WFI: Simply call kvm_vcpu_block(), which will halt execution of + * world-switches and schedule other host processes until there is an + * incoming IRQ or FIQ to the VM. */ -static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { trace_kvm_wfi(*vcpu_pc(vcpu)); - kvm_vcpu_block(vcpu); + if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) + kvm_vcpu_on_spin(vcpu); + else + kvm_vcpu_block(vcpu); + return 1; } static exit_handle_fn arm_exit_handlers[] = { - [HSR_EC_WFI] = kvm_handle_wfi, + [HSR_EC_WFI] = kvm_handle_wfx, [HSR_EC_CP15_32] = kvm_handle_cp15_32, [HSR_EC_CP15_64] = kvm_handle_cp15_64, [HSR_EC_CP14_MR] = kvm_handle_cp14_access, diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 0c25d94..4cb5a93 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -23,6 +23,68 @@ #include "trace.h" +static void mmio_write_buf(char *buf, unsigned int len, unsigned long data) +{ + void *datap = NULL; + union { + u8 byte; + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + tmp.byte = data; + datap = &tmp.byte; + break; + case 2: + tmp.hword = data; + datap = &tmp.hword; + break; + case 4: + tmp.word = data; + datap = &tmp.word; + break; + case 8: + tmp.dword = data; + datap = &tmp.dword; + break; + } + + memcpy(buf, datap, len); +} + +static unsigned long mmio_read_buf(char *buf, unsigned int len) +{ + unsigned long data = 0; + union { + u16 hword; + u32 word; + u64 dword; + } tmp; + + switch (len) { + case 1: + data = buf[0]; + break; + case 2: + memcpy(&tmp.hword, buf, len); + data = tmp.hword; + break; + case 4: + memcpy(&tmp.word, buf, len); + data = tmp.word; + break; + case 8: + memcpy(&tmp.dword, buf, len); + data = tmp.dword; + break; + } + + return data; +} + /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * @vcpu: The VCPU pointer @@ -33,28 +95,27 @@ */ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) { - unsigned long *dest; + unsigned long data; unsigned int len; int mask; if (!run->mmio.is_write) { - dest = vcpu_reg(vcpu, vcpu->arch.mmio_decode.rt); - *dest = 0; - len = run->mmio.len; if (len > sizeof(unsigned long)) return -EINVAL; - memcpy(dest, run->mmio.data, len); - - trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - *((u64 *)run->mmio.data)); + data = mmio_read_buf(run->mmio.data, len); if (vcpu->arch.mmio_decode.sign_extend && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); - *dest = (*dest ^ mask) - mask; + data = (data ^ mask) - mask; } + + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, + data); + data = vcpu_data_host_to_guest(vcpu, data, len); + *vcpu_reg(vcpu, vcpu->arch.mmio_decode.rt) = data; } return 0; @@ -105,6 +166,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa) { struct kvm_exit_mmio mmio; + unsigned long data; unsigned long rt; int ret; @@ -125,13 +187,15 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, } rt = vcpu->arch.mmio_decode.rt; + data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), mmio.len); + trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE : KVM_TRACE_MMIO_READ_UNSATISFIED, mmio.len, fault_ipa, - (mmio.is_write) ? *vcpu_reg(vcpu, rt) : 0); + (mmio.is_write) ? data : 0); if (mmio.is_write) - memcpy(mmio.data, vcpu_reg(vcpu, rt), mmio.len); + mmio_write_buf(mmio.data, mmio.len, data); if (vgic_handle_mmio(vcpu, run, &mmio)) return 1; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index b0de86b..3719583 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -19,6 +19,7 @@ #include <linux/mman.h> #include <linux/kvm_host.h> #include <linux/io.h> +#include <linux/hugetlb.h> #include <trace/events/kvm.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> @@ -41,6 +42,8 @@ static unsigned long hyp_idmap_start; static unsigned long hyp_idmap_end; static phys_addr_t hyp_idmap_vector; +#define kvm_pmd_huge(_x) (pmd_huge(_x) || pmd_trans_huge(_x)) + static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) { /* @@ -93,19 +96,29 @@ static bool page_empty(void *ptr) static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table = pmd_offset(pud, 0); - pud_clear(pud); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pmd_free(NULL, pmd_table); + if (pud_huge(*pud)) { + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + pmd_t *pmd_table = pmd_offset(pud, 0); + pud_clear(pud); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pmd_free(NULL, pmd_table); + } put_page(virt_to_page(pud)); } static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr) { - pte_t *pte_table = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(kvm, addr); - pte_free_kernel(NULL, pte_table); + if (kvm_pmd_huge(*pmd)) { + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + pte_t *pte_table = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvm_tlb_flush_vmid_ipa(kvm, addr); + pte_free_kernel(NULL, pte_table); + } put_page(virt_to_page(pmd)); } @@ -136,18 +149,32 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, continue; } + if (pud_huge(*pud)) { + /* + * If we are dealing with a huge pud, just clear it and + * move on. + */ + clear_pud_entry(kvm, pud, addr); + addr = pud_addr_end(addr, end); + continue; + } + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { addr = pmd_addr_end(addr, end); continue; } - pte = pte_offset_kernel(pmd, addr); - clear_pte_entry(kvm, pte, addr); - next = addr + PAGE_SIZE; + if (!kvm_pmd_huge(*pmd)) { + pte = pte_offset_kernel(pmd, addr); + clear_pte_entry(kvm, pte, addr); + next = addr + PAGE_SIZE; + } - /* If we emptied the pte, walk back up the ladder */ - if (page_empty(pte)) { + /* + * If the pmd entry is to be cleared, walk back up the ladder + */ + if (kvm_pmd_huge(*pmd) || page_empty(pte)) { clear_pmd_entry(kvm, pmd, addr); next = pmd_addr_end(addr, end); if (page_empty(pmd) && !page_empty(pud)) { @@ -420,29 +447,71 @@ void kvm_free_stage2_pgd(struct kvm *kvm) kvm->arch.pgd = NULL; } - -static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, bool iomap) +static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, old_pte; - /* Create 2nd stage page table mapping - Level 1 */ pgd = kvm->arch.pgd + pgd_index(addr); pud = pud_offset(pgd, addr); if (pud_none(*pud)) { if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ + return NULL; pmd = mmu_memory_cache_alloc(cache); pud_populate(NULL, pud, pmd); get_page(virt_to_page(pud)); } - pmd = pmd_offset(pud, addr); + return pmd_offset(pud, addr); +} + +static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache + *cache, phys_addr_t addr, const pmd_t *new_pmd) +{ + pmd_t *pmd, old_pmd; + + pmd = stage2_get_pmd(kvm, cache, addr); + VM_BUG_ON(!pmd); + + /* + * Mapping in huge pages should only happen through a fault. If a + * page is merged into a transparent huge page, the individual + * subpages of that huge page should be unmapped through MMU + * notifiers before we get here. + * + * Merging of CompoundPages is not supported; they should become + * splitting first, unmapped, merged, and mapped back in on-demand. + */ + VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd)); + + old_pmd = *pmd; + kvm_set_pmd(pmd, *new_pmd); + if (pmd_present(old_pmd)) + kvm_tlb_flush_vmid_ipa(kvm, addr); + else + get_page(virt_to_page(pmd)); + return 0; +} + +static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pte_t *new_pte, bool iomap) +{ + pmd_t *pmd; + pte_t *pte, old_pte; - /* Create 2nd stage page table mapping - Level 2 */ + /* Create stage-2 page table mapping - Level 1 */ + pmd = stage2_get_pmd(kvm, cache, addr); + if (!pmd) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* Create stage-2 page mappings - Level 2 */ if (pmd_none(*pmd)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ @@ -507,16 +576,60 @@ out: return ret; } +static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap) +{ + pfn_t pfn = *pfnp; + gfn_t gfn = *ipap >> PAGE_SHIFT; + + if (PageTransCompound(pfn_to_page(pfn))) { + unsigned long mask; + /* + * The address we faulted on is backed by a transparent huge + * page. However, because we map the compound huge page and + * not the individual tail page, we need to transfer the + * refcount to the head page. We have to be careful that the + * THP doesn't start to split while we are adjusting the + * refcounts. + * + * We are sure this doesn't happen, because mmu_notifier_retry + * was successful and we are holding the mmu_lock, so if this + * THP is trying to split, it will be blocked in the mmu + * notifier before touching any of the pages, specifically + * before being able to call __split_huge_page_refcount(). + * + * We can therefore safely transfer the refcount from PG_tail + * to PG_head and switch the pfn from a tail page to the head + * page accordingly. + */ + mask = PTRS_PER_PMD - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + *ipap &= PMD_MASK; + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); + *pfnp = pfn; + } + + return true; + } + + return false; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, - gfn_t gfn, struct kvm_memory_slot *memslot, + struct kvm_memory_slot *memslot, unsigned long fault_status) { - pte_t new_pte; - pfn_t pfn; int ret; - bool write_fault, writable; + bool write_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; + gfn_t gfn = fault_ipa >> PAGE_SHIFT; + unsigned long hva = gfn_to_hva(vcpu->kvm, gfn); + struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + struct vm_area_struct *vma; + pfn_t pfn; write_fault = kvm_is_write_fault(kvm_vcpu_get_hsr(vcpu)); if (fault_status == FSC_PERM && !write_fault) { @@ -524,6 +637,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + /* Let's check if we will get back a huge page backed by hugetlbfs */ + down_read(¤t->mm->mmap_sem); + vma = find_vma_intersection(current->mm, hva, hva + 1); + if (is_vm_hugetlb_page(vma)) { + hugetlb = true; + gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; + } else { + /* + * Pages belonging to VMAs not aligned to the PMD mapping + * granularity cannot be mapped using block descriptors even + * if the pages belong to a THP for the process, because the + * stage-2 block descriptor will cover more than a single THP + * and we loose atomicity for unmapping, updates, and splits + * of the THP or other pages in the stage-2 block range. + */ + if (vma->vm_start & ~PMD_MASK) + force_pte = true; + } + up_read(¤t->mm->mmap_sem); + /* We need minimum second+third level pages */ ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS); if (ret) @@ -541,26 +674,40 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ smp_rmb(); - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable); + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); if (is_error_pfn(pfn)) return -EFAULT; - new_pte = pfn_pte(pfn, PAGE_S2); - coherent_icache_guest_page(vcpu->kvm, gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (writable) { - kvm_set_s2pte_writable(&new_pte); - kvm_set_pfn_dirty(pfn); + if (!hugetlb && !force_pte) + hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + + if (hugetlb) { + pmd_t new_pmd = pfn_pmd(pfn, PAGE_S2); + new_pmd = pmd_mkhuge(new_pmd); + if (writable) { + kvm_set_s2pmd_writable(&new_pmd); + kvm_set_pfn_dirty(pfn); + } + coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE); + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); + } else { + pte_t new_pte = pfn_pte(pfn, PAGE_S2); + if (writable) { + kvm_set_s2pte_writable(&new_pte); + kvm_set_pfn_dirty(pfn); + } + coherent_icache_guest_page(kvm, hva, PAGE_SIZE); + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false); } - stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte, false); + out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); + spin_unlock(&kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return ret; } /** @@ -629,7 +776,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) memslot = gfn_to_memslot(vcpu->kvm, gfn); - ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, fault_status); if (ret == 0) ret = 1; out_unlock: diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index 86a693a..0881bf1 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -18,6 +18,7 @@ #include <linux/kvm_host.h> #include <linux/wait.h> +#include <asm/cputype.h> #include <asm/kvm_emulate.h> #include <asm/kvm_psci.h> @@ -34,22 +35,30 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu = NULL, *tmp; wait_queue_head_t *wq; unsigned long cpu_id; + unsigned long mpidr; phys_addr_t target_pc; + int i; cpu_id = *vcpu_reg(source_vcpu, 1); if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); - if (cpu_id >= atomic_read(&kvm->online_vcpus)) + kvm_for_each_vcpu(i, tmp, kvm) { + mpidr = kvm_vcpu_get_mpidr(tmp); + if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) { + vcpu = tmp; + break; + } + } + + if (!vcpu) return KVM_PSCI_RET_INVAL; target_pc = *vcpu_reg(source_vcpu, 2); - vcpu = kvm_get_vcpu(kvm, cpu_id); - wq = kvm_arch_vcpu_wq(vcpu); if (!waitqueue_active(wq)) return KVM_PSCI_RET_INVAL; @@ -62,6 +71,10 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) vcpu_set_thumb(vcpu); } + /* Propagate caller endianness */ + if (kvm_vcpu_is_be(source_vcpu)) + kvm_vcpu_set_be(vcpu); + *vcpu_pc(vcpu) = target_pc; vcpu->arch.pause = false; smp_mb(); /* Make sure the above is visible */ diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index c02ba4a..f558c07 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -30,16 +30,14 @@ #include <kvm/arm_arch_timer.h> /****************************************************************************** - * Cortex-A15 Reset Values + * Cortex-A15 and Cortex-A7 Reset Values */ -static const int a15_max_cpu_idx = 3; - -static struct kvm_regs a15_regs_reset = { +static struct kvm_regs cortexa_regs_reset = { .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, }; -static const struct kvm_irq_level a15_vtimer_irq = { +static const struct kvm_irq_level cortexa_vtimer_irq = { { .irq = 27 }, .level = 1, }; @@ -62,12 +60,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) const struct kvm_irq_level *cpu_vtimer_irq; switch (vcpu->arch.target) { + case KVM_ARM_TARGET_CORTEX_A7: case KVM_ARM_TARGET_CORTEX_A15: - if (vcpu->vcpu_id > a15_max_cpu_idx) - return -EINVAL; - reset_regs = &a15_regs_reset; + reset_regs = &cortexa_regs_reset; vcpu->arch.midr = read_cpuid_id(); - cpu_vtimer_irq = &a15_vtimer_irq; + cpu_vtimer_irq = &cortexa_vtimer_irq; break; default: return -ENODEV; diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index a5f28e2..c98ef47 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -63,6 +63,7 @@ * TAC: Trap ACTLR * TSC: Trap SMC * TSW: Trap cache operations by set/way + * TWE: Trap WFE * TWI: Trap WFI * TIDCP: Trap L2CTLR/L2ECTLR * BSU_IS: Upgrade barriers to the inner shareable domain @@ -72,8 +73,9 @@ * FMO: Override CPSR.F and enable signaling with VF * SWIO: Turn set/way invalidates into set/way clean+invalidate */ -#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ - HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ +#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ + HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_AMO | HCR_IMO | HCR_FMO | \ HCR_SWIO | HCR_TIDCP | HCR_RW) #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF) @@ -242,4 +244,6 @@ #define ESR_EL2_EC_xABT_xFSR_EXTABT 0x10 +#define ESR_EL2_EC_WFI_ISS_WFE (1 << 0) + #endif /* __ARM64_KVM_ARM_H__ */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index eec0738..dd8ecfc3 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -177,4 +177,65 @@ static inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & ESR_EL2_FSC_TYPE; } +static inline unsigned long kvm_vcpu_get_mpidr(struct kvm_vcpu *vcpu) +{ + return vcpu_sys_reg(vcpu, MPIDR_EL1); +} + +static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) + *vcpu_cpsr(vcpu) |= COMPAT_PSR_E_BIT; + else + vcpu_sys_reg(vcpu, SCTLR_EL1) |= (1 << 25); +} + +static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) +{ + if (vcpu_mode_is_32bit(vcpu)) + return !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_E_BIT); + + return !!(vcpu_sys_reg(vcpu, SCTLR_EL1) & (1 << 25)); +} + +static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return be16_to_cpu(data & 0xffff); + case 4: + return be32_to_cpu(data & 0xffffffff); + default: + return be64_to_cpu(data); + } + } + + return data; /* Leave LE untouched */ +} + +static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, + unsigned long data, + unsigned int len) +{ + if (kvm_vcpu_is_be(vcpu)) { + switch (len) { + case 1: + return data & 0xff; + case 2: + return cpu_to_be16(data & 0xffff); + case 4: + return cpu_to_be32(data & 0xffffffff); + default: + return cpu_to_be64(data); + } + } + + return data; /* Leave LE untouched */ +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0859a4d..5d85a02 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -36,11 +36,6 @@ #define KVM_VCPU_MAX_FEATURES 2 -/* We don't currently support large pages. */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) - struct kvm_vcpu; int kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -151,6 +146,7 @@ struct kvm_vcpu_stat { struct kvm_vcpu_init; int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init); +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); struct kvm_one_reg; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index efe609c..680f74e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -91,6 +91,7 @@ int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); #define kvm_set_pte(ptep, pte) set_pte(ptep, pte) +#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd) static inline bool kvm_is_write_fault(unsigned long esr) { @@ -116,13 +117,18 @@ static inline void kvm_set_s2pte_writable(pte_t *pte) pte_val(*pte) |= PTE_S2_RDWR; } +static inline void kvm_set_s2pmd_writable(pmd_t *pmd) +{ + pmd_val(*pmd) |= PMD_S2_RDWR; +} + struct kvm; -static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn) +static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva, + unsigned long size) { if (!icache_is_aliasing()) { /* PIPT */ - unsigned long hva = gfn_to_hva(kvm, gfn); - flush_icache_range(hva, hva + PAGE_SIZE); + flush_icache_range(hva, hva + size); } else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */ /* any kind of VIPT cache */ __flush_icache_all(); diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d57e668..755f861 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -85,6 +85,8 @@ #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ +#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ + /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 21e9082..4480ab3 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_ARM_HOST select KVM_ARM_VGIC diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2c3ff67..3f0731e 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -248,6 +248,26 @@ int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, return kvm_reset_vcpu(vcpu); } +int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) +{ + int target = kvm_target_cpu(); + + if (target < 0) + return -ENODEV; + + memset(init, 0, sizeof(*init)); + + /* + * For now, we don't return any features. + * In future, we might use features to return target + * specific features available for the preferred + * target type. + */ + init->target = (__u32)target; + + return 0; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -EINVAL; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 9beaca0..8da5606 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -47,21 +47,29 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) } /** - * kvm_handle_wfi - handle a wait-for-interrupts instruction executed by a guest + * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event + * instruction executed by a guest + * * @vcpu: the vcpu pointer * - * Simply call kvm_vcpu_block(), which will halt execution of + * WFE: Yield the CPU and come back to this vcpu when the scheduler + * decides to. + * WFI: Simply call kvm_vcpu_block(), which will halt execution of * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. */ -static int kvm_handle_wfi(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_vcpu_block(vcpu); + if (kvm_vcpu_get_hsr(vcpu) & ESR_EL2_EC_WFI_ISS_WFE) + kvm_vcpu_on_spin(vcpu); + else + kvm_vcpu_block(vcpu); + return 1; } static exit_handle_fn arm_exit_handlers[] = { - [ESR_EL2_EC_WFI] = kvm_handle_wfi, + [ESR_EL2_EC_WFI] = kvm_handle_wfx, [ESR_EL2_EC_CP15_32] = kvm_handle_cp15_32, [ESR_EL2_EC_CP15_64] = kvm_handle_cp15_64, [ESR_EL2_EC_CP14_MR] = kvm_handle_cp14_access, diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 989dd3f..db95f57 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -234,10 +234,6 @@ struct kvm_vm_data { #define KVM_REQ_PTC_G 32 #define KVM_REQ_RESUME 33 -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) 1 - struct kvm; struct kvm_vcpu; @@ -480,7 +476,7 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; - int iommu_flags; + bool iommu_noncoherent; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bdfd878..985bf80 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1550,12 +1550,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 4d6fa0b..3296696 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -27,13 +27,6 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 -/* Don't support huge pages */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 - -/* We don't currently support large pages. */ -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) 1 - /* Special address that contains the comm page, used for reducing # of traps */ diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c index a7b0445..73b3482 100644 --- a/arch/mips/kvm/kvm_mips.c +++ b/arch/mips/kvm/kvm_mips.c @@ -198,12 +198,13 @@ kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return -ENOIOCTLCMD; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 9b198d1..856f8de 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst) return inst & 0xffff; } +static inline unsigned int get_oc(u32 inst) +{ + return (inst >> 11) & 0x7fff; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index cca12f0..894662a 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -198,12 +198,27 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) cmpwi r10,0; \ bne do_kvm_##n +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +/* + * If hv is possible, interrupts come into to the hv version + * of the kvmppc_interrupt code, which then jumps to the PR handler, + * kvmppc_interrupt_pr, if the guest is a PR guest. + */ +#define kvmppc_interrupt kvmppc_interrupt_hv +#else +#define kvmppc_interrupt kvmppc_interrupt_pr +#endif + #define __KVM_HANDLER(area, h, n) \ do_kvm_##n: \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ std r10,HSTATE_CFAR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947); \ + BEGIN_FTR_SECTION_NESTED(948) \ + ld r10,area+EX_PPR(r13); \ + std r10,HSTATE_PPR(r13); \ + END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r10,area+EX_R10(r13); \ stw r9,HSTATE_SCRATCH1(r13); \ ld r9,area+EX_R9(r13); \ @@ -217,6 +232,10 @@ do_kvm_##n: \ ld r10,area+EX_R10(r13); \ beq 89f; \ stw r9,HSTATE_SCRATCH1(r13); \ + BEGIN_FTR_SECTION_NESTED(948) \ + ld r9,area+EX_PPR(r13); \ + std r9,HSTATE_PPR(r13); \ + END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r9,area+EX_R9(r13); \ std r12,HSTATE_SCRATCH0(r13); \ li r12,n; \ @@ -236,7 +255,7 @@ do_kvm_##n: \ #define KVM_HANDLER_SKIP(area, h, n) #endif -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE #define KVMTEST_PR(n) __KVMTEST(n) #define KVM_HANDLER_PR(area, h, n) __KVM_HANDLER(area, h, n) #define KVM_HANDLER_PR_SKIP(area, h, n) __KVM_HANDLER_SKIP(area, h, n) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 851bac7..1bd92fd 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -123,6 +123,8 @@ #define BOOK3S_HFLAG_SLB 0x2 #define BOOK3S_HFLAG_PAIRED_SINGLE 0x4 #define BOOK3S_HFLAG_NATIVE_PS 0x8 +#define BOOK3S_HFLAG_MULTI_PGSIZE 0x10 +#define BOOK3S_HFLAG_NEW_TLBIE 0x20 #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ @@ -136,6 +138,8 @@ #define KVM_GUEST_MODE_NONE 0 #define KVM_GUEST_MODE_GUEST 1 #define KVM_GUEST_MODE_SKIP 2 +#define KVM_GUEST_MODE_GUEST_HV 3 +#define KVM_GUEST_MODE_HOST_HV 4 #define KVM_INST_FETCH_FAILED -1 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index fa19e2f..4a594b7 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -58,16 +58,18 @@ struct hpte_cache { struct hlist_node list_pte_long; struct hlist_node list_vpte; struct hlist_node list_vpte_long; +#ifdef CONFIG_PPC_BOOK3S_64 + struct hlist_node list_vpte_64k; +#endif struct rcu_head rcu_head; u64 host_vpn; u64 pfn; ulong slot; struct kvmppc_pte pte; + int pagesize; }; struct kvmppc_vcpu_book3s { - struct kvm_vcpu vcpu; - struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct { u64 esid; @@ -99,6 +101,9 @@ struct kvmppc_vcpu_book3s { struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; +#ifdef CONFIG_PPC_BOOK3S_64 + struct hlist_head hpte_hash_vpte_64k[HPTEG_HASH_NUM_VPTE_64K]; +#endif int hpte_cache_count; spinlock_t mmu_lock; }; @@ -107,8 +112,9 @@ struct kvmppc_vcpu_book3s { #define CONTEXT_GUEST 1 #define CONTEXT_GUEST_END 2 -#define VSID_REAL 0x0fffffffffc00000ULL -#define VSID_BAT 0x0fffffffffb00000ULL +#define VSID_REAL 0x07ffffffffc00000ULL +#define VSID_BAT 0x07ffffffffb00000ULL +#define VSID_64K 0x0800000000000000ULL #define VSID_1T 0x1000000000000000ULL #define VSID_REAL_DR 0x2000000000000000ULL #define VSID_REAL_IR 0x4000000000000000ULL @@ -118,11 +124,12 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask) extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask); extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end); extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr); -extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr); extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); -extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); +extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte, + bool iswrite); +extern void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size); extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); @@ -134,6 +141,7 @@ extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); +extern void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte); extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu); extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); @@ -151,7 +159,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper, u32 val); extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); -extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, + bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, @@ -172,6 +181,8 @@ extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long *hpret); extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, + unsigned long mask); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); @@ -184,11 +195,9 @@ extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { - return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu); + return vcpu->arch.book3s; } -extern void kvm_return_point(void); - /* Also add subarch specific defines */ #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -198,203 +207,6 @@ extern void kvm_return_point(void); #include <asm/kvm_book3s_64.h> #endif -#ifdef CONFIG_KVM_BOOK3S_PR - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - return to_book3s(vcpu)->hior; -} - -static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, - unsigned long pending_now, unsigned long old_pending) -{ - if (pending_now) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; -} - -static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) -{ - if ( num < 14 ) { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->gpr[num] = val; - svcpu_put(svcpu); - to_book3s(vcpu)->shadow_vcpu->gpr[num] = val; - } else - vcpu->arch.gpr[num] = val; -} - -static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) -{ - if ( num < 14 ) { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r = svcpu->gpr[num]; - svcpu_put(svcpu); - return r; - } else - return vcpu->arch.gpr[num]; -} - -static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->cr = val; - svcpu_put(svcpu); - to_book3s(vcpu)->shadow_vcpu->cr = val; -} - -static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - r = svcpu->cr; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->xer = val; - to_book3s(vcpu)->shadow_vcpu->xer = val; - svcpu_put(svcpu); -} - -static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - r = svcpu->xer; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->ctr = val; - svcpu_put(svcpu); -} - -static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->ctr; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->lr = val; - svcpu_put(svcpu); -} - -static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->lr; - svcpu_put(svcpu); - return r; -} - -static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - svcpu->pc = val; - svcpu_put(svcpu); -} - -static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->pc; - svcpu_put(svcpu); - return r; -} - -static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) -{ - ulong pc = kvmppc_get_pc(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - - /* Load the instruction manually if it failed to do so in the - * exit path */ - if (svcpu->last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); - - r = svcpu->last_inst; - svcpu_put(svcpu); - return r; -} - -/* - * Like kvmppc_get_last_inst(), but for fetching a sc instruction. - * Because the sc instruction sets SRR0 to point to the following - * instruction, we have to fetch from pc - 4. - */ -static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu) -{ - ulong pc = kvmppc_get_pc(vcpu) - 4; - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 r; - - /* Load the instruction manually if it failed to do so in the - * exit path */ - if (svcpu->last_inst == KVM_INST_FETCH_FAILED) - kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); - - r = svcpu->last_inst; - svcpu_put(svcpu); - return r; -} - -static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) -{ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong r; - r = svcpu->fault_dar; - svcpu_put(svcpu); - return r; -} - -static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) -{ - ulong crit_raw = vcpu->arch.shared->critical; - ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); - bool crit; - - /* Truncate crit indicators in 32 bit mode */ - if (!(vcpu->arch.shared->msr & MSR_SF)) { - crit_raw &= 0xffffffff; - crit_r1 &= 0xffffffff; - } - - /* Critical section when crit == r1 */ - crit = (crit_raw == crit_r1); - /* ... and we're in supervisor mode */ - crit = crit && !(vcpu->arch.shared->msr & MSR_PR); - - return crit; -} -#else /* CONFIG_KVM_BOOK3S_PR */ - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, - unsigned long pending_now, unsigned long old_pending) -{ -} - static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { vcpu->arch.gpr[num] = val; @@ -489,12 +301,6 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) return vcpu->arch.fault_dar; } -static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) -{ - return false; -} -#endif - /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/include/asm/kvm_book3s_32.h b/arch/powerpc/include/asm/kvm_book3s_32.h index ce0ef6c..c720e0b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_32.h +++ b/arch/powerpc/include/asm/kvm_book3s_32.h @@ -22,7 +22,7 @@ static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { - return to_book3s(vcpu)->shadow_vcpu; + return vcpu->arch.shadow_vcpu; } static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 86d638a..bf0fa8b0a 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -20,7 +20,7 @@ #ifndef __ASM_KVM_BOOK3S_64_H__ #define __ASM_KVM_BOOK3S_64_H__ -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { preempt_disable(); @@ -35,7 +35,7 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #define SPAPR_TCE_SHIFT 12 -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ extern unsigned long kvm_rma_pages; #endif @@ -278,7 +278,7 @@ static inline int is_vrma_hpte(unsigned long hpte_v) (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); } -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* * Note modification of an HPTE; set the HPTE modified bit * if anyone is interested. @@ -289,6 +289,6 @@ static inline void note_hpte_modification(struct kvm *kvm, if (atomic_read(&kvm->arch.hpte_mod_interest)) rev->guest_rpte |= HPTE_GR_MODIFIED; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 9039d3c..0bd9348 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -83,7 +83,7 @@ struct kvmppc_host_state { u8 restore_hid5; u8 napping; -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE u8 hwthread_req; u8 hwthread_state; u8 host_ipi; @@ -101,6 +101,7 @@ struct kvmppc_host_state { #endif #ifdef CONFIG_PPC_BOOK3S_64 u64 cfar; + u64 ppr; #endif }; @@ -108,14 +109,14 @@ struct kvmppc_book3s_shadow_vcpu { ulong gpr[14]; u32 cr; u32 xer; - - u32 fault_dsisr; - u32 last_inst; ulong ctr; ulong lr; ulong pc; + ulong shadow_srr1; ulong fault_dar; + u32 fault_dsisr; + u32 last_inst; #ifdef CONFIG_PPC_BOOK3S_32 u32 sr[16]; /* Guest SRs */ diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index d3c1eb3..dd8f615 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -26,7 +26,12 @@ /* LPIDs we support with this build -- runtime limit may be lower */ #define KVMPPC_NR_LPIDS 64 -#define KVMPPC_INST_EHPRIV 0x7c00021c +#define KVMPPC_INST_EHPRIV 0x7c00021c +#define EHPRIV_OC_SHIFT 11 +/* "ehpriv 1" : ehpriv with OC = 1 is used for debug emulation */ +#define EHPRIV_OC_DEBUG 1 +#define KVMPPC_INST_EHPRIV_DEBUG (KVMPPC_INST_EHPRIV | \ + (EHPRIV_OC_DEBUG << EHPRIV_OC_SHIFT)) static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3328353..237d1d2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -63,20 +63,17 @@ extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); #endif -/* We don't currently support large pages. */ -#define KVM_HPAGE_GFN_SHIFT(x) 0 -#define KVM_NR_PAGE_SIZES 1 -#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 #define HPTEG_HASH_BITS_VPTE 13 #define HPTEG_HASH_BITS_VPTE_LONG 5 +#define HPTEG_HASH_BITS_VPTE_64K 11 #define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE) #define HPTEG_HASH_NUM_PTE_LONG (1 << HPTEG_HASH_BITS_PTE_LONG) #define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE) #define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG) +#define HPTEG_HASH_NUM_VPTE_64K (1 << HPTEG_HASH_BITS_VPTE_64K) /* Physical Address Mask - allowed range of real mode RAM access */ #define KVM_PAM 0x0fffffffffffffffULL @@ -89,6 +86,9 @@ struct lppaca; struct slb_shadow; struct dtl_entry; +struct kvmppc_vcpu_book3s; +struct kvmppc_book3s_shadow_vcpu; + struct kvm_vm_stat { u32 remote_tlb_flush; }; @@ -224,15 +224,15 @@ struct revmap_entry { #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch_memory_slot { -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned long *rmap; unsigned long *slot_phys; -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ }; struct kvm_arch { unsigned int lpid; -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned long hpt_virt; struct revmap_entry *revmap; unsigned int host_lpid; @@ -256,7 +256,10 @@ struct kvm_arch { cpumask_t need_tlb_flush; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; int hpt_cma_alloc; -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + struct mutex hpt_mutex; +#endif #ifdef CONFIG_PPC_BOOK3S_64 struct list_head spapr_tce_tables; struct list_head rtas_tokens; @@ -267,6 +270,7 @@ struct kvm_arch { #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; #endif + struct kvmppc_ops *kvm_ops; }; /* @@ -294,6 +298,10 @@ struct kvmppc_vcore { u64 stolen_tb; u64 preempt_tb; struct kvm_vcpu *runner; + u64 tb_offset; /* guest timebase - host timebase */ + ulong lpcr; + u32 arch_compat; + ulong pcr; }; #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) @@ -328,6 +336,7 @@ struct kvmppc_pte { bool may_read : 1; bool may_write : 1; bool may_execute : 1; + u8 page_size; /* MMU_PAGE_xxx */ }; struct kvmppc_mmu { @@ -340,7 +349,8 @@ struct kvmppc_mmu { /* book3s */ void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value); u32 (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum); - int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data); + int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, bool iswrite); void (*reset_msr)(struct kvm_vcpu *vcpu); void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large); int (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); @@ -360,6 +370,7 @@ struct kvmppc_slb { bool large : 1; /* PTEs are 16MB */ bool tb : 1; /* 1TB segment */ bool class : 1; + u8 base_page_size; /* MMU_PAGE_xxx */ }; # ifdef CONFIG_PPC_FSL_BOOK3E @@ -377,17 +388,6 @@ struct kvmppc_slb { #define KVMPPC_EPR_USER 1 /* exit to userspace to fill EPR */ #define KVMPPC_EPR_KERNEL 2 /* in-kernel irqchip */ -struct kvmppc_booke_debug_reg { - u32 dbcr0; - u32 dbcr1; - u32 dbcr2; -#ifdef CONFIG_KVM_E500MC - u32 dbcr4; -#endif - u64 iac[KVMPPC_BOOKE_MAX_IAC]; - u64 dac[KVMPPC_BOOKE_MAX_DAC]; -}; - #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 #define KVMPPC_IRQ_XICS 2 @@ -402,6 +402,10 @@ struct kvm_vcpu_arch { int slb_max; /* 1 + index of last valid entry in slb[] */ int slb_nr; /* total number of entries in SLB */ struct kvmppc_mmu mmu; + struct kvmppc_vcpu_book3s *book3s; +#endif +#ifdef CONFIG_PPC_BOOK3S_32 + struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; #endif ulong gpr[32]; @@ -463,6 +467,8 @@ struct kvm_vcpu_arch { u32 ctrl; ulong dabr; ulong cfar; + ulong ppr; + ulong shadow_srr1; #endif u32 vrsave; /* also USPRG0 */ u32 mmucr; @@ -498,6 +504,8 @@ struct kvm_vcpu_arch { u64 mmcr[3]; u32 pmc[8]; + u64 siar; + u64 sdar; #ifdef CONFIG_KVM_EXIT_TIMING struct mutex exit_timing_lock; @@ -531,7 +539,10 @@ struct kvm_vcpu_arch { u32 eptcfg; u32 epr; u32 crit_save; - struct kvmppc_booke_debug_reg dbg_reg; + /* guest debug registers*/ + struct debug_reg dbg_reg; + /* hardware visible debug registers when in guest state */ + struct debug_reg shadow_dbg_reg; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; @@ -582,7 +593,7 @@ struct kvm_vcpu_arch { struct kvmppc_icp *icp; /* XICS presentation controller */ #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE struct kvm_vcpu_arch_shared shregs; unsigned long pgfault_addr; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index b15554a..c8317fb 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -106,13 +106,6 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu); extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu); - -extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int op, int *advance); -extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, - ulong val); -extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, - ulong *val); extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu); extern int kvmppc_booke_init(void); @@ -135,17 +128,17 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); -extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, - struct kvm_allocate_rma *rma); extern struct kvm_rma_info *kvm_alloc_rma(void); extern void kvm_release_rma(struct kvm_rma_info *ri); extern struct page *kvm_alloc_hpt(unsigned long nr_pages); extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); -extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +extern void kvmppc_core_free_memslot(struct kvm *kvm, + struct kvm_memory_slot *free, struct kvm_memory_slot *dont); -extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, +extern int kvmppc_core_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -177,6 +170,72 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq); extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq); +union kvmppc_one_reg { + u32 wval; + u64 dval; + vector128 vval; + u64 vsxval[2]; + struct { + u64 addr; + u64 length; + } vpaval; +}; + +struct kvmppc_ops { + struct module *owner; + int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); + int (*set_one_reg)(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + void (*vcpu_put)(struct kvm_vcpu *vcpu); + void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); + int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id); + void (*vcpu_free)(struct kvm_vcpu *vcpu); + int (*check_requests)(struct kvm_vcpu *vcpu); + int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log); + void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot); + int (*prepare_memory_region)(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem); + void (*commit_memory_region)(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old); + int (*unmap_hva)(struct kvm *kvm, unsigned long hva); + int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, + unsigned long end); + int (*age_hva)(struct kvm *kvm, unsigned long hva); + int (*test_age_hva)(struct kvm *kvm, unsigned long hva); + void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); + void (*mmu_destroy)(struct kvm_vcpu *vcpu); + void (*free_memslot)(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); + int (*create_memslot)(struct kvm_memory_slot *slot, + unsigned long npages); + int (*init_vm)(struct kvm *kvm); + void (*destroy_vm)(struct kvm *kvm); + int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info); + int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); + int (*emulate_mtspr)(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); + int (*emulate_mfspr)(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); + void (*fast_vcpu_kick)(struct kvm_vcpu *vcpu); + long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, + unsigned long arg); + +}; + +extern struct kvmppc_ops *kvmppc_hv_ops; +extern struct kvmppc_ops *kvmppc_pr_ops; + +static inline bool is_kvmppc_hv_enabled(struct kvm *kvm) +{ + return kvm->arch.kvm_ops == kvmppc_hv_ops; +} + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. @@ -210,17 +269,6 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) return r; } -union kvmppc_one_reg { - u32 wval; - u64 dval; - vector128 vval; - u64 vsxval[2]; - struct { - u64 addr; - u64 length; - } vpaval; -}; - #define one_reg_size(id) \ (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -245,10 +293,10 @@ union kvmppc_one_reg { __v; \ }) -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); @@ -260,7 +308,7 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); struct openpic; -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE extern void kvm_cma_reserve(void) __init; static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) { @@ -269,10 +317,10 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) static inline u32 kvmppc_get_xics_latch(void) { - u32 xirr = get_paca()->kvm_hstate.saved_xirr; + u32 xirr; + xirr = get_paca()->kvm_hstate.saved_xirr; get_paca()->kvm_hstate.saved_xirr = 0; - return xirr; } @@ -281,7 +329,10 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) paca[cpu].kvm_hstate.host_ipi = host_ipi; } -extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu); +static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->fast_vcpu_kick(vcpu); +} #else static inline void __init kvm_cma_reserve(void) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index a5954ce..b6ea9e0 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -166,7 +166,7 @@ struct paca_struct { struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ #ifdef CONFIG_KVM_BOOK3S_HANDLER -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE /* We use this to store guest state in */ struct kvmppc_book3s_shadow_vcpu shadow_vcpu; #endif diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 7794b2b..fc14a38 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -208,6 +208,7 @@ struct debug_reg { struct thread_struct { unsigned long ksp; /* Kernel stack pointer */ + #ifdef CONFIG_PPC64 unsigned long ksp_vsid; #endif @@ -221,6 +222,7 @@ struct thread_struct { void *pgdir; /* root of page-table tree */ unsigned long ksp_limit; /* if ksp <= ksp_limit stack overflow */ #endif + /* Debug Registers */ struct debug_reg debug; struct thread_fp_state fp_state; struct thread_fp_state *fp_save_area; diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 0156702..576ad88 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -40,7 +40,7 @@ #define _PAGE_U1 0x010000 #define _PAGE_U0 0x020000 #define _PAGE_ACCESSED 0x040000 -#define _PAGE_LENDIAN 0x080000 +#define _PAGE_ENDIAN 0x080000 #define _PAGE_GUARDED 0x100000 #define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */ #define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 126f6e9..5c45787 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -248,6 +248,7 @@ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_TBWL 0x11C /* Time Base Lower Register (super, R/W) */ #define SPRN_TBWU 0x11D /* Time Base Upper Register (super, R/W) */ +#define SPRN_TBU40 0x11E /* Timebase upper 40 bits (hyper, R/W) */ #define SPRN_SPURR 0x134 /* Scaled PURR */ #define SPRN_HSPRG0 0x130 /* Hypervisor Scratch 0 */ #define SPRN_HSPRG1 0x131 /* Hypervisor Scratch 1 */ @@ -288,6 +289,7 @@ #define LPCR_ISL (1ul << (63-2)) #define LPCR_VC_SH (63-2) #define LPCR_DPFD_SH (63-11) +#define LPCR_DPFD (7ul << LPCR_DPFD_SH) #define LPCR_VRMASD (0x1ful << (63-16)) #define LPCR_VRMA_L (1ul << (63-12)) #define LPCR_VRMA_LP0 (1ul << (63-15)) @@ -304,6 +306,7 @@ #define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ #define LPCR_MER 0x00000800 /* Mediated External Exception */ #define LPCR_MER_SH 11 +#define LPCR_TC 0x00000200 /* Translation control */ #define LPCR_LPES 0x0000000c #define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ #define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ @@ -316,6 +319,10 @@ #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ #define SPRN_HMER 0x150 /* Hardware m? error recovery */ #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ +#define SPRN_PCR 0x152 /* Processor compatibility register */ +#define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ +#define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ +#define PCR_ARCH_205 0x2 /* Architecture 2.05 */ #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ #define SPRN_TLBINDEXR 0x154 /* P7 TLB control register */ #define SPRN_TLBVPNR 0x155 /* P7 TLB control register */ @@ -425,6 +432,7 @@ #define HID4_RMLS2_SH (63 - 2) /* Real mode limit bottom 2 bits */ #define HID4_LPID5_SH (63 - 6) /* partition ID bottom 4 bits */ #define HID4_RMOR_SH (63 - 22) /* real mode offset (16 bits) */ +#define HID4_RMOR (0xFFFFul << HID4_RMOR_SH) #define HID4_LPES1 (1 << (63-57)) /* LPAR env. sel. bit 1 */ #define HID4_RMLS0_SH (63 - 58) /* Real mode limit top bit */ #define HID4_LPID1_SH 0 /* partition ID top 2 bits */ @@ -1107,6 +1115,13 @@ #define PVR_BE 0x0070 #define PVR_PA6T 0x0090 +/* "Logical" PVR values defined in PAPR, representing architecture levels */ +#define PVR_ARCH_204 0x0f000001 +#define PVR_ARCH_205 0x0f000002 +#define PVR_ARCH_206 0x0f000003 +#define PVR_ARCH_206p 0x0f100003 +#define PVR_ARCH_207 0x0f000004 + /* Macros for setting and retrieving special purpose registers */ #ifndef __ASSEMBLY__ #define mfmsr() ({unsigned long rval; \ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 0fb1a6e..6836ec7 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -27,6 +27,7 @@ #define __KVM_HAVE_PPC_SMT #define __KVM_HAVE_IRQCHIP #define __KVM_HAVE_IRQ_LINE +#define __KVM_HAVE_GUEST_DEBUG struct kvm_regs { __u64 pc; @@ -269,7 +270,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as "type" in KVM_SET_GUEST_DEBUG ioctl and "status" + * for KVM_DEBUG_EXIT. + */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) +#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) struct kvm_debug_exit_arch { + __u64 address; + /* + * exiting to userspace because of h/w breakpoint, watchpoint + * (read, write or both) and software breakpoint. + */ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -281,10 +299,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). */ -#define KVMPPC_DEBUG_NONE 0x0 -#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2) -#define KVMPPC_DEBUG_WATCH_READ (1UL << 3) __u32 type; __u32 reserved; } bp[16]; @@ -429,6 +443,11 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10) #define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11) #define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12) +#define KVM_REG_PPC_MMCR2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13) +#define KVM_REG_PPC_MMCRS (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x14) +#define KVM_REG_PPC_SIAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x15) +#define KVM_REG_PPC_SDAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x16) +#define KVM_REG_PPC_SIER (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x17) #define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18) #define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19) @@ -499,6 +518,65 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a) #define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b) +/* Timebase offset */ +#define KVM_REG_PPC_TB_OFFSET (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9c) + +/* POWER8 registers */ +#define KVM_REG_PPC_SPMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9d) +#define KVM_REG_PPC_SPMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9e) +#define KVM_REG_PPC_IAMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9f) +#define KVM_REG_PPC_TFHAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa0) +#define KVM_REG_PPC_TFIAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa1) +#define KVM_REG_PPC_TEXASR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa2) +#define KVM_REG_PPC_FSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa3) +#define KVM_REG_PPC_PSPB (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xa4) +#define KVM_REG_PPC_EBBHR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa5) +#define KVM_REG_PPC_EBBRR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa6) +#define KVM_REG_PPC_BESCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7) +#define KVM_REG_PPC_TAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8) +#define KVM_REG_PPC_DPDES (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9) +#define KVM_REG_PPC_DAWR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa) +#define KVM_REG_PPC_DAWRX (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab) +#define KVM_REG_PPC_CIABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac) +#define KVM_REG_PPC_IC (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad) +#define KVM_REG_PPC_VTB (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae) +#define KVM_REG_PPC_CSIGR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaf) +#define KVM_REG_PPC_TACR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb0) +#define KVM_REG_PPC_TCSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1) +#define KVM_REG_PPC_PID (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2) +#define KVM_REG_PPC_ACOP (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3) + +#define KVM_REG_PPC_VRSAVE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4) +#define KVM_REG_PPC_LPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5) +#define KVM_REG_PPC_PPR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb6) + +/* Architecture compatibility level */ +#define KVM_REG_PPC_ARCH_COMPAT (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7) + +/* Transactional Memory checkpointed state: + * This is all GPRs, all VSX regs and a subset of SPRs + */ +#define KVM_REG_PPC_TM (KVM_REG_PPC | 0x80000000) +/* TM GPRs */ +#define KVM_REG_PPC_TM_GPR0 (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0) +#define KVM_REG_PPC_TM_GPR(n) (KVM_REG_PPC_TM_GPR0 + (n)) +#define KVM_REG_PPC_TM_GPR31 (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x1f) +/* TM VSX */ +#define KVM_REG_PPC_TM_VSR0 (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x20) +#define KVM_REG_PPC_TM_VSR(n) (KVM_REG_PPC_TM_VSR0 + (n)) +#define KVM_REG_PPC_TM_VSR63 (KVM_REG_PPC_TM | KVM_REG_SIZE_U128 | 0x5f) +/* TM SPRS */ +#define KVM_REG_PPC_TM_CR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x60) +#define KVM_REG_PPC_TM_LR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x61) +#define KVM_REG_PPC_TM_CTR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x62) +#define KVM_REG_PPC_TM_FPSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x63) +#define KVM_REG_PPC_TM_AMR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x64) +#define KVM_REG_PPC_TM_PPR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x65) +#define KVM_REG_PPC_TM_VRSAVE (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x66) +#define KVM_REG_PPC_TM_VSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67) +#define KVM_REG_PPC_TM_DSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68) +#define KVM_REG_PPC_TM_TAR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69) + /* PPC64 eXternal Interrupt Controller Specification */ #define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e60a369..2ea5cc0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -439,7 +439,7 @@ int main(void) DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr)); DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0)); DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1)); @@ -470,7 +470,7 @@ int main(void) DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); /* book3s */ -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -502,6 +502,8 @@ int main(void) DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); + DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar)); + DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar)); DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); @@ -511,18 +513,22 @@ int main(void) DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar)); + DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); - DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - - offsetof(struct kvmppc_vcpu_book3s, vcpu)); + DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset)); + DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr)); + DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); #ifdef CONFIG_PPC_BOOK3S_64 -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu)); # define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f)) #else # define SVCPU_FIELD(x, f) @@ -574,7 +580,7 @@ int main(void) HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); HSTATE_FIELD(HSTATE_NAPPING, napping); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req); HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); @@ -590,10 +596,11 @@ int main(void) HSTATE_FIELD(HSTATE_DABR, dabr); HSTATE_FIELD(HSTATE_DECEXP, dec_expires); DEFINE(IPI_PRIORITY, IPI_PRIORITY); -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 HSTATE_FIELD(HSTATE_CFAR, cfar); + HSTATE_FIELD(HSTATE_PPR, ppr); #endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3a9ed6a..9f905e4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -126,7 +126,7 @@ BEGIN_FTR_SECTION bgt cr1,. GET_PACA(r13) -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) /* Order setting hwthread_state vs. testing hwthread_req */ @@ -425,7 +425,7 @@ data_access_check_stab: mfspr r9,SPRN_DSISR srdi r10,r10,60 rlwimi r10,r9,16,0x20 -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE lbz r9,HSTATE_IN_GUEST(r13) rlwimi r10,r9,8,0x300 #endif @@ -650,6 +650,32 @@ slb_miss_user_pseries: b . /* prevent spec. execution */ #endif /* __DISABLED__ */ +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +kvmppc_skip_interrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_SRR0 + addi r13, r13, 4 + mtspr SPRN_SRR0, r13 + GET_SCRATCH0(r13) + rfid + b . + +kvmppc_skip_Hinterrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_HSRR0 + addi r13, r13, 4 + mtspr SPRN_HSRR0, r13 + GET_SCRATCH0(r13) + hrfid + b . +#endif + /* * Code from here down to __end_handlers is invoked from the * exception prologs above. Because the prologs assemble the diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index e11863f..847e40e 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -84,7 +84,7 @@ _GLOBAL(power7_nap) std r9,_MSR(r1) std r1,PACAR1(r13) -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* Tell KVM we're napping */ li r4,KVM_HWTHREAD_IN_NAP stb r4,HSTATE_HWTHREAD_STATE(r13) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 62c3dd8..907a472 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1529,7 +1529,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) * back on or not. */ if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, - current->thread.debug.dbcr1)) + current->thread.debug.dbcr1)) regs->msr |= MSR_DE; else /* Make sure the IDM flag is off */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 2f5c6b6..93221e8 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -31,13 +31,13 @@ #include "44x_tlb.h" #include "booke.h" -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu) { kvmppc_booke_vcpu_load(vcpu, cpu); kvmppc_44x_tlb_load(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu) { kvmppc_44x_tlb_put(vcpu); kvmppc_booke_vcpu_put(vcpu); @@ -114,29 +114,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { return -EINVAL; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { return -EINVAL; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; struct kvm_vcpu *vcpu; @@ -167,7 +170,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); @@ -176,28 +179,53 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_44x(struct kvm *kvm) { return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_44x(struct kvm *kvm) { } +static struct kvmppc_ops kvm_ops_44x = { + .get_sregs = kvmppc_core_get_sregs_44x, + .set_sregs = kvmppc_core_set_sregs_44x, + .get_one_reg = kvmppc_get_one_reg_44x, + .set_one_reg = kvmppc_set_one_reg_44x, + .vcpu_load = kvmppc_core_vcpu_load_44x, + .vcpu_put = kvmppc_core_vcpu_put_44x, + .vcpu_create = kvmppc_core_vcpu_create_44x, + .vcpu_free = kvmppc_core_vcpu_free_44x, + .mmu_destroy = kvmppc_mmu_destroy_44x, + .init_vm = kvmppc_core_init_vm_44x, + .destroy_vm = kvmppc_core_destroy_vm_44x, + .emulate_op = kvmppc_core_emulate_op_44x, + .emulate_mtspr = kvmppc_core_emulate_mtspr_44x, + .emulate_mfspr = kvmppc_core_emulate_mfspr_44x, +}; + static int __init kvmppc_44x_init(void) { int r; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_44x.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_44x; - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); +err_out: + return r; } static void __exit kvmppc_44x_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 35ec0a8..92c9ab4 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -91,8 +91,8 @@ static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) return EMULATE_DONE; } -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int dcrn = get_dcrn(inst); @@ -152,7 +152,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; @@ -172,7 +172,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index ed03854..0deef10 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -268,7 +268,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x, trace_kvm_stlb_inval(stlb_index); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); int i; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index e593ff2..141b202 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -35,17 +35,20 @@ config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER -config KVM_BOOK3S_PR +config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO select MMU_NOTIFIER +config KVM_BOOK3S_HV_POSSIBLE + bool + config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT select KVM select KVM_BOOK3S_32_HANDLER - select KVM_BOOK3S_PR + select KVM_BOOK3S_PR_POSSIBLE ---help--- Support running unmodified book3s_32 guest kernels in virtual machines on book3s_32 host processors. @@ -60,6 +63,7 @@ config KVM_BOOK3S_64 depends on PPC_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM + select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. @@ -70,8 +74,9 @@ config KVM_BOOK3S_64 If unsure, say N. config KVM_BOOK3S_64_HV - bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" + tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA ---help--- @@ -90,9 +95,20 @@ config KVM_BOOK3S_64_HV If unsure, say N. config KVM_BOOK3S_64_PR - def_bool y - depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV - select KVM_BOOK3S_PR + tristate "KVM support without using hypervisor mode in host" + depends on KVM_BOOK3S_64 + select KVM_BOOK3S_PR_POSSIBLE + ---help--- + Support running guest kernels in virtual machines on processors + without using hypervisor mode in the host, by running the + guest in user mode (problem state) and emulating all + privileged instructions and registers. + + This is not as fast as using hypervisor mode, but works on + machines where hypervisor mode is not available or not usable, + and can emulate processors that are different from the host + processor, including emulating 32-bit processors on a 64-bit + host. config KVM_BOOKE_HV bool diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 6646c95..ce569b6 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -53,41 +53,51 @@ kvm-e500mc-objs := \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ - $(KVM)/coalesced_mmio.o \ +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ + book3s_64_vio_hv.o + +kvm-pr-y := \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ book3s_pr_papr.o \ - book3s_64_vio_hv.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ book3s_64_mmu_host.o \ book3s_64_mmu.o \ book3s_32_mmu.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ + +ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +kvm-book3s_64-module-objs := \ + $(KVM)/coalesced_mmio.o + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_rmhandlers.o +endif -kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ +kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o + kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ book3s_hv_rm_xics.o -kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + +ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ - book3s_64_vio_hv.o \ book3s_hv_ras.o \ book3s_hv_builtin.o \ book3s_hv_cma.o \ $(kvm-book3s_64-builtin-xics-objs-y) +endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o -kvm-book3s_64-module-objs := \ +kvm-book3s_64-module-objs += \ $(KVM)/kvm_main.o \ $(KVM)/eventfd.o \ powerpc.o \ @@ -123,4 +133,7 @@ obj-$(CONFIG_KVM_E500MC) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o +obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o + obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 700df6f..8912608 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -34,6 +34,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> +#include "book3s.h" #include "trace.h" #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -69,6 +70,50 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) +{ + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + return to_book3s(vcpu)->hior; + return 0; +} + +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, + unsigned long pending_now, unsigned long old_pending) +{ + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return; + if (pending_now) + vcpu->arch.shared->int_pending = 1; + else if (old_pending) + vcpu->arch.shared->int_pending = 0; +} + +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) +{ + ulong crit_raw; + ulong crit_r1; + bool crit; + + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return false; + + crit_raw = vcpu->arch.shared->critical; + crit_r1 = kvmppc_get_gpr(vcpu, 1); + + /* Truncate crit indicators in 32 bit mode */ + if (!(vcpu->arch.shared->msr & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(vcpu->arch.shared->msr & MSR_PR); + + return crit; +} + void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu); @@ -126,28 +171,32 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) printk(KERN_INFO "Queueing interrupt %x\n", vec); #endif } - +EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec); int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) { return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); } +EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec); void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); } +EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) @@ -285,8 +334,10 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return 0; } +EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); -pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) +pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing, + bool *writable) { ulong mp_pa = vcpu->arch.magic_page_pa; @@ -302,20 +353,23 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; get_page(pfn_to_page(pfn)); + if (writable) + *writable = true; return pfn; } - return gfn_to_pfn(vcpu->kvm, gfn); + return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); } +EXPORT_SYMBOL_GPL(kvmppc_gfn_to_pfn); static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, - struct kvmppc_pte *pte) + bool iswrite, struct kvmppc_pte *pte) { int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR)); int r; if (relocated) { - r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); + r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite); } else { pte->eaddr = eaddr; pte->raddr = eaddr & KVM_PAM; @@ -361,7 +415,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.st++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, true, &pte)) return -ENOENT; *eaddr = pte.raddr; @@ -374,6 +428,7 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, return EMULATE_DONE; } +EXPORT_SYMBOL_GPL(kvmppc_st); int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data) @@ -383,7 +438,7 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, vcpu->stat.ld++; - if (kvmppc_xlate(vcpu, *eaddr, data, &pte)) + if (kvmppc_xlate(vcpu, *eaddr, data, false, &pte)) goto nopte; *eaddr = pte.raddr; @@ -404,6 +459,7 @@ nopte: mmio: return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_ld); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { @@ -419,6 +475,18 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) { } +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -495,8 +563,7 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) if (size > sizeof(val)) return -EINVAL; - r = kvmppc_get_one_reg(vcpu, reg->id, &val); - + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); if (r == -EINVAL) { r = 0; switch (reg->id) { @@ -528,6 +595,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) } val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; #endif /* CONFIG_ALTIVEC */ case KVM_REG_PPC_DEBUG_INST: { u32 opcode = INS_TW; @@ -572,8 +642,7 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) return -EFAULT; - r = kvmppc_set_one_reg(vcpu, reg->id, &val); - + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); if (r == -EINVAL) { r = 0; switch (reg->id) { @@ -605,6 +674,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) } vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); break; + case KVM_REG_PPC_VRSAVE: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: @@ -625,6 +701,27 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr); + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { @@ -644,3 +741,141 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_core_queue_dec(vcpu); kvm_vcpu_kick(vcpu); } + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return kvm->arch.kvm_ops->get_dirty_log(kvm, log); +} + +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + kvm->arch.kvm_ops->free_memslot(free, dont); +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvm->arch.kvm_ops->create_memslot(slot, npages); +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + kvm->arch.kvm_ops->flush_memslot(kvm, memslot); +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->unmap_hva(kvm, hva); +} +EXPORT_SYMBOL_GPL(kvm_unmap_hva); + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->age_hva(kvm, hva); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->test_age_hva(kvm, hva); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + +#ifdef CONFIG_PPC64 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); +#endif + + return kvm->arch.kvm_ops->init_vm(kvm); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); + +#ifdef CONFIG_PPC64 + kvmppc_rtas_tokens_free(kvm); + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif +} + +int kvmppc_core_check_processor_compat(void) +{ + /* + * We always return 0 for book3s. We check + * for compatability while loading the HV + * or PR module + */ + return 0; +} + +static int kvmppc_book3s_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) + return r; +#ifdef CONFIG_KVM_BOOK3S_32 + r = kvmppc_book3s_init_pr(); +#endif + return r; + +} + +static void kvmppc_book3s_exit(void) +{ +#ifdef CONFIG_KVM_BOOK3S_32 + kvmppc_book3s_exit_pr(); +#endif + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h new file mode 100644 index 0000000..4bf956c --- /dev/null +++ b/arch/powerpc/kvm/book3s.h @@ -0,0 +1,34 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __POWERPC_KVM_BOOK3S_H__ +#define __POWERPC_KVM_BOOK3S_H__ + +extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot); +extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, + unsigned long end); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); + +extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong spr_val); +extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong *spr_val); +extern int kvmppc_book3s_init_pr(void); +extern void kvmppc_book3s_exit_pr(void); + +#endif diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index c8cefdd..76a64ce 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -84,7 +84,8 @@ static inline bool sr_nx(u32 sr_raw) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data); + struct kvmppc_pte *pte, bool data, + bool iswrite); static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid); @@ -99,7 +100,7 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, u64 vsid; struct kvmppc_pte pte; - if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data)) + if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false)) return pte.vpage; kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); @@ -111,10 +112,11 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); } -static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, u32 sre, gva_t eaddr, bool primary) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 page, hash, pteg, htabmask; hva_t r; @@ -132,7 +134,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3 kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg, sr_vsid(sre)); - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; return r | (pteg & ~PAGE_MASK); @@ -145,7 +147,8 @@ static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary) } static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_bat *bat; @@ -186,8 +189,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, printk(KERN_INFO "BAT is not readable!\n"); continue; } - if (!pte->may_write) { - /* let's treat r/o BATs as not-readable for now */ + if (iswrite && !pte->may_write) { dprintk_pte("BAT is read-only!\n"); continue; } @@ -201,9 +203,8 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data, - bool primary) + bool iswrite, bool primary) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u32 sre; hva_t ptegp; u32 pteg[16]; @@ -218,7 +219,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); - ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary); + ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary); if (kvm_is_error_hva(ptegp)) { printk(KERN_INFO "KVM: Invalid PTEG!\n"); goto no_page_found; @@ -258,9 +259,6 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, break; } - if ( !pte->may_read ) - continue; - dprintk_pte("MMU: Found PTE -> %x %x - %x\n", pteg[i], pteg[i+1], pp); found = 1; @@ -271,19 +269,23 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, /* Update PTE C and A bits, so the guest's swapper knows we used the page */ if (found) { - u32 oldpte = pteg[i+1]; - - if (pte->may_read) - pteg[i+1] |= PTEG_FLAG_ACCESSED; - if (pte->may_write) - pteg[i+1] |= PTEG_FLAG_DIRTY; - else - dprintk_pte("KVM: Mapping read-only page!\n"); - - /* Write back into the PTEG */ - if (pteg[i+1] != oldpte) - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - + u32 pte_r = pteg[i+1]; + char __user *addr = (char __user *) &pteg[i+1]; + + /* + * Use single-byte writes to update the HPTE, to + * conform to what real hardware does. + */ + if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) { + pte_r |= PTEG_FLAG_ACCESSED; + put_user(pte_r >> 8, addr + 2); + } + if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) { + pte_r |= PTEG_FLAG_DIRTY; + put_user(pte_r, addr + 3); + } + if (!pte->may_read || (iswrite && !pte->may_write)) + return -EPERM; return 0; } @@ -302,12 +304,14 @@ no_page_found: } static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *pte, bool data) + struct kvmppc_pte *pte, bool data, + bool iswrite) { int r; ulong mp_ea = vcpu->arch.magic_page_ea; pte->eaddr = eaddr; + pte->page_size = MMU_PAGE_4K; /* Magic page override */ if (unlikely(mp_ea) && @@ -323,11 +327,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } - r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, true); if (r < 0) - r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false); + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, false); return r; } @@ -347,7 +353,12 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) { - kvmppc_mmu_pte_flush(vcpu, ea, 0x0FFFF000); + int i; + struct kvm_vcpu *v; + + /* flush this VA on all cpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000); } static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 00e619b..3a0abd2 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -138,7 +138,8 @@ static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr, extern char etext[]; -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { pfn_t hpaddr; u64 vpn; @@ -152,9 +153,11 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool evict = false; struct hpte_cache *pte; int r = 0; + bool writable; /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); + hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT, + iswrite, &writable); if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); @@ -204,7 +207,7 @@ next_pteg: (primary ? 0 : PTE_SEC); pteg1 = hpaddr | PTE_M | PTE_R | PTE_C; - if (orig_pte->may_write) { + if (orig_pte->may_write && writable) { pteg1 |= PP_RWRW; mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); } else { @@ -259,6 +262,11 @@ out: return r; } +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL); +} + static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; @@ -341,7 +349,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 7e345e0..83da1f8 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -107,9 +107,20 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, return kvmppc_slb_calc_vpn(slb, eaddr); } +static int mmu_pagesize(int mmu_pg) +{ + switch (mmu_pg) { + case MMU_PAGE_64K: + return 16; + case MMU_PAGE_16M: + return 24; + } + return 12; +} + static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) { - return slbe->large ? 24 : 12; + return mmu_pagesize(slbe->base_page_size); } static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) @@ -119,11 +130,11 @@ static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p); } -static hva_t kvmppc_mmu_book3s_64_get_pteg( - struct kvmppc_vcpu_book3s *vcpu_book3s, +static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu, struct kvmppc_slb *slbe, gva_t eaddr, bool second) { + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u64 hash, pteg, htabsize; u32 ssize; hva_t r; @@ -148,10 +159,10 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg( /* When running a PAPR guest, SDR1 contains a HVA address instead of a GPA */ - if (vcpu_book3s->vcpu.arch.papr_enabled) + if (vcpu->arch.papr_enabled) r = pteg; else - r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); if (kvm_is_error_hva(r)) return r; @@ -166,18 +177,38 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); - if (p < 24) - avpn >>= ((80 - p) - 56) - 8; + if (p < 16) + avpn >>= ((80 - p) - 56) - 8; /* 16 - p */ else - avpn <<= 8; + avpn <<= p - 16; return avpn; } +/* + * Return page size encoded in the second word of a HPTE, or + * -1 for an invalid encoding for the base page size indicated by + * the SLB entry. This doesn't handle mixed pagesize segments yet. + */ +static int decode_pagesize(struct kvmppc_slb *slbe, u64 r) +{ + switch (slbe->base_page_size) { + case MMU_PAGE_64K: + if ((r & 0xf000) == 0x1000) + return MMU_PAGE_64K; + break; + case MMU_PAGE_16M: + if ((r & 0xff000) == 0) + return MMU_PAGE_16M; + break; + } + return -1; +} + static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, + bool iswrite) { - struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); struct kvmppc_slb *slbe; hva_t ptegp; u64 pteg[16]; @@ -189,6 +220,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, u8 pp, key = 0; bool found = false; bool second = false; + int pgsize; ulong mp_ea = vcpu->arch.magic_page_ea; /* Magic page override */ @@ -202,6 +234,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_execute = true; gpte->may_read = true; gpte->may_write = true; + gpte->page_size = MMU_PAGE_4K; return 0; } @@ -222,8 +255,12 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | HPTE_V_SECONDARY; + pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + do_second: - ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); + ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second); if (kvm_is_error_hva(ptegp)) goto no_page_found; @@ -240,6 +277,13 @@ do_second: for (i=0; i<16; i+=2) { /* Check all relevant fields of 1st dword */ if ((pteg[i] & v_mask) == v_val) { + /* If large page bit is set, check pgsize encoding */ + if (slbe->large && + (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + pgsize = decode_pagesize(slbe, pteg[i+1]); + if (pgsize < 0) + continue; + } found = true; break; } @@ -256,13 +300,15 @@ do_second: v = pteg[i]; r = pteg[i+1]; pp = (r & HPTE_R_PP) | key; - eaddr_mask = 0xFFF; + if (r & HPTE_R_PP0) + pp |= 8; gpte->eaddr = eaddr; gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); - if (slbe->large) - eaddr_mask = 0xFFFFFF; + + eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1; gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); + gpte->page_size = pgsize; gpte->may_execute = ((r & HPTE_R_N) ? false : true); gpte->may_read = false; gpte->may_write = false; @@ -277,6 +323,7 @@ do_second: case 3: case 5: case 7: + case 10: gpte->may_read = true; break; } @@ -287,30 +334,37 @@ do_second: /* Update PTE R and C bits, so the guest's swapper knows we used the * page */ - if (gpte->may_read) { - /* Set the accessed flag */ + if (gpte->may_read && !(r & HPTE_R_R)) { + /* + * Set the accessed flag. + * We have to write this back with a single byte write + * because another vcpu may be accessing this on + * non-PAPR platforms such as mac99, and this is + * what real hardware does. + */ + char __user *addr = (char __user *) &pteg[i+1]; r |= HPTE_R_R; + put_user(r >> 8, addr + 6); } - if (data && gpte->may_write) { - /* Set the dirty flag -- XXX even if not writing */ + if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { + /* Set the dirty flag */ + /* Use a single byte write */ + char __user *addr = (char __user *) &pteg[i+1]; r |= HPTE_R_C; + put_user(r, addr + 7); } - /* Write back into the PTEG */ - if (pteg[i+1] != r) { - pteg[i+1] = r; - copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); - } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); - if (!gpte->may_read) + if (!gpte->may_read || (iswrite && !gpte->may_write)) return -EPERM; return 0; no_page_found: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); return -ENOENT; no_seg_found: - dprintk("KVM MMU: Trigger segment fault\n"); return -EINVAL; } @@ -345,6 +399,21 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) slbe->nx = (rs & SLB_VSID_N) ? 1 : 0; slbe->class = (rs & SLB_VSID_C) ? 1 : 0; + slbe->base_page_size = MMU_PAGE_4K; + if (slbe->large) { + if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) { + switch (rs & SLB_VSID_LP) { + case SLB_VSID_LP_00: + slbe->base_page_size = MMU_PAGE_16M; + break; + case SLB_VSID_LP_01: + slbe->base_page_size = MMU_PAGE_64K; + break; + } + } else + slbe->base_page_size = MMU_PAGE_16M; + } + slbe->orige = rb & (ESID_MASK | SLB_ESID_V); slbe->origv = rs; @@ -460,14 +529,45 @@ static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, bool large) { u64 mask = 0xFFFFFFFFFULL; + long i; + struct kvm_vcpu *v; dprintk("KVM MMU: tlbie(0x%lx)\n", va); - if (large) - mask = 0xFFFFFF000ULL; - kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask); + /* + * The tlbie instruction changed behaviour starting with + * POWER6. POWER6 and later don't have the large page flag + * in the instruction but in the RB value, along with bits + * indicating page and segment sizes. + */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) { + /* POWER6 or later */ + if (va & 1) { /* L bit */ + if ((va & 0xf000) == 0x1000) + mask = 0xFFFFFFFF0ULL; /* 64k page */ + else + mask = 0xFFFFFF000ULL; /* 16M page */ + } + } else { + /* older processors, e.g. PPC970 */ + if (large) + mask = 0xFFFFFF000ULL; + } + /* flush this VA on all vcpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_vflush(v, va >> 12, mask); } +#ifdef CONFIG_PPC_64K_PAGES +static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid) +{ + ulong mp_ea = vcpu->arch.magic_page_ea; + + return mp_ea && !(vcpu->arch.shared->msr & MSR_PR) && + (mp_ea >> SID_SHIFT) == esid; +} +#endif + static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid) { @@ -475,11 +575,13 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, struct kvmppc_slb *slb; u64 gvsid = esid; ulong mp_ea = vcpu->arch.magic_page_ea; + int pagesize = MMU_PAGE_64K; if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); if (slb) { gvsid = slb->vsid; + pagesize = slb->base_page_size; if (slb->tb) { gvsid <<= SID_SHIFT_1T - SID_SHIFT; gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); @@ -490,28 +592,41 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { case 0: - *vsid = VSID_REAL | esid; + gvsid = VSID_REAL | esid; break; case MSR_IR: - *vsid = VSID_REAL_IR | gvsid; + gvsid |= VSID_REAL_IR; break; case MSR_DR: - *vsid = VSID_REAL_DR | gvsid; + gvsid |= VSID_REAL_DR; break; case MSR_DR|MSR_IR: if (!slb) goto no_slb; - *vsid = gvsid; break; default: BUG(); break; } +#ifdef CONFIG_PPC_64K_PAGES + /* + * Mark this as a 64k segment if the host is using + * 64k pages, the host MMU supports 64k pages and + * the guest segment page size is >= 64k, + * but not if this segment contains the magic page. + */ + if (pagesize >= MMU_PAGE_64K && + mmu_psize_defs[MMU_PAGE_64K].shift && + !segment_contains_magic_page(vcpu, esid)) + gvsid |= VSID_64K; +#endif + if (vcpu->arch.shared->msr & MSR_PR) - *vsid |= VSID_PR; + gvsid |= VSID_PR; + *vsid = gvsid; return 0; no_slb: diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index e524052..0d513af 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -27,14 +27,14 @@ #include <asm/machdep.h> #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, - MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M, + pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M, false); } @@ -78,7 +78,8 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) return NULL; } -int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) { unsigned long vpn; pfn_t hpaddr; @@ -90,16 +91,26 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) int attempt = 0; struct kvmppc_sid_map *map; int r = 0; + int hpsize = MMU_PAGE_4K; + bool writable; + unsigned long mmu_seq; + struct kvm *kvm = vcpu->kvm; + struct hpte_cache *cpte; + unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; + unsigned long pfn; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); /* Get host physical address for gpa */ - hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_noslot_pfn(hpaddr)) { - printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); + pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, &writable); + if (is_error_noslot_pfn(pfn)) { + printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", gfn); r = -EINVAL; goto out; } - hpaddr <<= PAGE_SHIFT; - hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + hpaddr = pfn << PAGE_SHIFT; /* and write the mapping ea -> hpa into the pt */ vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); @@ -117,20 +128,39 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) goto out; } - vsid = map->host_vsid; - vpn = hpt_vpn(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); + vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); - if (!orig_pte->may_write) - rflags |= HPTE_R_PP; - else - mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + kvm_set_pfn_accessed(pfn); + if (!orig_pte->may_write || !writable) + rflags |= PP_RXRX; + else { + mark_page_dirty(vcpu->kvm, gfn); + kvm_set_pfn_dirty(pfn); + } if (!orig_pte->may_execute) rflags |= HPTE_R_N; else - kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); + kvmppc_mmu_flush_icache(pfn); + + /* + * Use 64K pages if possible; otherwise, on 64K page kernels, + * we need to transfer 4 more bits from guest real to host real addr. + */ + if (vsid & VSID_64K) + hpsize = MMU_PAGE_64K; + else + hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + + hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); - hash = hpt_hash(vpn, PTE_SIZE, MMU_SEGSIZE_256M); + cpte = kvmppc_mmu_hpte_cache_next(vcpu); + + spin_lock(&kvm->mmu_lock); + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + r = -EAGAIN; + goto out_unlock; + } map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); @@ -139,11 +169,11 @@ map_again: if (attempt > 1) if (ppc_md.hpte_remove(hpteg) < 0) { r = -1; - goto out; + goto out_unlock; } ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, - MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M); + hpsize, hpsize, MMU_SEGSIZE_256M); if (ret < 0) { /* If we couldn't map a primary PTE, try a secondary */ @@ -152,8 +182,6 @@ map_again: attempt++; goto map_again; } else { - struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); - trace_kvm_book3s_64_mmu_map(rflags, hpteg, vpn, hpaddr, orig_pte); @@ -164,19 +192,37 @@ map_again: hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); } - pte->slot = hpteg + (ret & 7); - pte->host_vpn = vpn; - pte->pte = *orig_pte; - pte->pfn = hpaddr >> PAGE_SHIFT; + cpte->slot = hpteg + (ret & 7); + cpte->host_vpn = vpn; + cpte->pte = *orig_pte; + cpte->pfn = pfn; + cpte->pagesize = hpsize; - kvmppc_mmu_hpte_cache_map(vcpu, pte); + kvmppc_mmu_hpte_cache_map(vcpu, cpte); + cpte = NULL; } - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (cpte) + kvmppc_mmu_hpte_cache_free(cpte); out: return r; } +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + u64 mask = 0xfffffffffULL; + u64 vsid; + + vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid); + if (vsid & VSID_64K) + mask = 0xffffffff0ULL; + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask); +} + static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { struct kvmppc_sid_map *map; @@ -291,6 +337,12 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; +#ifdef CONFIG_PPC_64K_PAGES + /* Set host segment base page size to 64K if possible */ + if (gvsid & VSID_64K) + slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp; +#endif + svcpu->slb[slb_index].esid = slb_esid; svcpu->slb[slb_index].vsid = slb_vsid; @@ -326,7 +378,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) svcpu_put(svcpu); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) { kvmppc_mmu_hpte_destroy(vcpu); __destroy_context(to_book3s(vcpu)->context_id[0]); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 043eec8..f3ff587 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -260,10 +260,6 @@ int kvmppc_mmu_hv_init(void) return 0; } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ -} - static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) { kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); @@ -451,7 +447,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, } static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data, bool iswrite) { struct kvm *kvm = vcpu->kvm; struct kvmppc_slb *slbe; @@ -906,21 +902,22 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { if (kvm->arch.using_mmu_notifiers) kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { if (kvm->arch.using_mmu_notifiers) kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); return 0; } -void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot) { unsigned long *rmapp; unsigned long gfn; @@ -994,7 +991,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.using_mmu_notifiers) return 0; @@ -1032,14 +1029,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { if (!kvm->arch.using_mmu_notifiers) return 0; return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { if (!kvm->arch.using_mmu_notifiers) return; @@ -1512,9 +1509,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; - lpcr |= senc << (LPCR_VRMASD_SH - 4); - kvm->arch.lpcr = lpcr; + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); rma_setup = 1; } ++i; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 30c2f3b..2c25f54 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -74,3 +74,4 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, /* Didn't find the liobn, punt it to userspace */ return H_TOO_HARD; } +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 360ce68..99d40f8 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -86,8 +86,8 @@ static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) return true; } -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int rt = get_rt(inst); @@ -172,7 +172,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } -#ifdef CONFIG_KVM_BOOK3S_64_PR +#ifdef CONFIG_PPC_BOOK3S_64 case OP_31_XOP_FAKE_SC1: { /* SC 1 papr hypercalls */ @@ -267,12 +267,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { - struct kvmppc_book3s_shadow_vcpu *svcpu; - - svcpu = svcpu_get(vcpu); *advance = 0; vcpu->arch.shared->dar = vaddr; - svcpu->fault_dar = vaddr; + vcpu->arch.fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -281,8 +278,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->dsisr = dsisr; - svcpu->fault_dsisr = dsisr; - svcpu_put(svcpu); + vcpu->arch.fault_dsisr = dsisr; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); @@ -349,7 +345,7 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) return bat; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; @@ -472,7 +468,7 @@ unprivileged: return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 7057a02..852989a 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -20,9 +20,10 @@ #include <linux/export.h> #include <asm/kvm_book3s.h> -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); -#else +#endif +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 62a2b5a..072287f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -52,6 +52,9 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/hugetlb.h> +#include <linux/module.h> + +#include "book3s.h" /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ @@ -66,7 +69,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); -void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) +static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int me; int cpu = vcpu->cpu; @@ -125,7 +128,7 @@ void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) * purely defensive; they should never fail.) */ -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -143,7 +146,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) spin_unlock(&vcpu->arch.tbacct_lock); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -155,17 +158,46 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.tbacct_lock); } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { vcpu->arch.shregs.msr = msr; kvmppc_end_cede(vcpu); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; } +int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +{ + unsigned long pcr = 0; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (arch_compat) { + if (!cpu_has_feature(CPU_FTR_ARCH_206)) + return -EINVAL; /* 970 has no compat mode support */ + + switch (arch_compat) { + case PVR_ARCH_205: + pcr = PCR_ARCH_205; + break; + case PVR_ARCH_206: + case PVR_ARCH_206p: + break; + default: + return -EINVAL; + } + } + + spin_lock(&vc->lock); + vc->arch_compat = arch_compat; + vc->pcr = pcr; + spin_unlock(&vc->lock); + + return 0; +} + void kvmppc_dump_regs(struct kvm_vcpu *vcpu) { int r; @@ -195,7 +227,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) pr_err(" ESID = %.16llx VSID = %.16llx\n", vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", - vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, vcpu->arch.last_inst); } @@ -489,7 +521,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = now; + dt->timebase = now + vc->tb_offset; dt->enqueue_to_dispatch_time = stolen; dt->srr0 = kvmppc_get_pc(vcpu); dt->srr1 = vcpu->arch.shregs.msr; @@ -538,6 +570,15 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } break; case H_CONFER: + target = kvmppc_get_gpr(vcpu, 4); + if (target == -1) + break; + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + kvm_vcpu_yield_to(tvcpu); break; case H_REGISTER_VPA: ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), @@ -576,8 +617,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - struct task_struct *tsk) +static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) { int r = RESUME_HOST; @@ -671,16 +712,16 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.shregs.msr); + run->hw.hardware_exit_reason = vcpu->arch.trap; r = RESUME_HOST; - BUG(); break; } return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i; @@ -694,12 +735,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int i, j; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_hv(vcpu, sregs->pvr); j = 0; for (i = 0; i < vcpu->arch.slb_nr; i++) { @@ -714,7 +755,23 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + spin_lock(&vc->lock); + /* + * Userspace can only modify DPFD (default prefetch depth), + * ILE (interrupt little-endian) and TC (translation control). + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; + vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); + spin_unlock(&vc->lock); +} + +static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; long int i; @@ -749,6 +806,12 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; *val = get_reg_val(id, vcpu->arch.pmc[i]); break; + case KVM_REG_PPC_SIAR: + *val = get_reg_val(id, vcpu->arch.siar); + break; + case KVM_REG_PPC_SDAR: + *val = get_reg_val(id, vcpu->arch.sdar); + break; #ifdef CONFIG_VSX case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: if (cpu_has_feature(CPU_FTR_VSX)) { @@ -787,6 +850,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) val->vpaval.length = vcpu->arch.dtl.len; spin_unlock(&vcpu->arch.vpa_update_lock); break; + case KVM_REG_PPC_TB_OFFSET: + *val = get_reg_val(id, vcpu->arch.vcore->tb_offset); + break; + case KVM_REG_PPC_LPCR: + *val = get_reg_val(id, vcpu->arch.vcore->lpcr); + break; + case KVM_REG_PPC_PPR: + *val = get_reg_val(id, vcpu->arch.ppr); + break; + case KVM_REG_PPC_ARCH_COMPAT: + *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); + break; default: r = -EINVAL; break; @@ -795,7 +870,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; long int i; @@ -833,6 +909,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) i = id - KVM_REG_PPC_PMC1; vcpu->arch.pmc[i] = set_reg_val(id, *val); break; + case KVM_REG_PPC_SIAR: + vcpu->arch.siar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SDAR: + vcpu->arch.sdar = set_reg_val(id, *val); + break; #ifdef CONFIG_VSX case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: if (cpu_has_feature(CPU_FTR_VSX)) { @@ -880,6 +962,20 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) len -= len % sizeof(struct dtl_entry); r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; + case KVM_REG_PPC_TB_OFFSET: + /* round up to multiple of 2^24 */ + vcpu->arch.vcore->tb_offset = + ALIGN(set_reg_val(id, *val), 1UL << 24); + break; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr(vcpu, set_reg_val(id, *val)); + break; + case KVM_REG_PPC_PPR: + vcpu->arch.ppr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_ARCH_COMPAT: + r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); + break; default: r = -EINVAL; break; @@ -888,14 +984,8 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_core_check_processor_compat(void) -{ - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0; - return -EIO; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, + unsigned int id) { struct kvm_vcpu *vcpu; int err = -EINVAL; @@ -919,8 +1009,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.mmcr[0] = MMCR0_FC; vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ - vcpu->arch.pvr = mfspr(SPRN_PVR); - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; @@ -940,6 +1029,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; + vcore->lpcr = kvm->arch.lpcr; } kvm->arch.vcores[core] = vcore; kvm->arch.online_vcores++; @@ -972,7 +1062,7 @@ static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) vpa->dirty); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->arch.vpa_update_lock); unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); @@ -983,6 +1073,12 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu); } +static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) +{ + /* Indicate we want to get back into the guest */ + return 1; +} + static void kvmppc_set_timer(struct kvm_vcpu *vcpu) { unsigned long dec_nsec, now; @@ -1264,8 +1360,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) ret = RESUME_GUEST; if (vcpu->arch.trap) - ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); vcpu->arch.ret = ret; vcpu->arch.trap = 0; @@ -1424,7 +1520,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; int srcu_idx; @@ -1546,7 +1642,8 @@ static const struct file_operations kvm_rma_fops = { .release = kvm_rma_release, }; -long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) +static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, + struct kvm_allocate_rma *ret) { long fd; struct kvm_rma_info *ri; @@ -1592,7 +1689,8 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, (*sps)++; } -int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { struct kvm_ppc_one_seg_page_size *sps; @@ -1613,7 +1711,8 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) /* * Get (and clear) the dirty memory log for a memory slot. */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; int r; @@ -1667,8 +1766,8 @@ static void unpin_slot(struct kvm_memory_slot *memslot) } } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) { if (!dont || free->arch.rmap != dont->arch.rmap) { vfree(free->arch.rmap); @@ -1681,8 +1780,8 @@ void kvmppc_core_free_memslot(struct kvm_memory_slot *free, } } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) +static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, + unsigned long npages) { slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) @@ -1692,9 +1791,9 @@ int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, return 0; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) { unsigned long *phys; @@ -1710,9 +1809,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) +static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; struct kvm_memory_slot *memslot; @@ -1729,6 +1828,37 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, } } +/* + * Update LPCR values in kvm->arch and in vcores. + * Caller must hold kvm->lock. + */ +void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) +{ + long int i; + u32 cores_done = 0; + + if ((kvm->arch.lpcr & mask) == lpcr) + return; + + kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; + + for (i = 0; i < KVM_MAX_VCORES; ++i) { + struct kvmppc_vcore *vc = kvm->arch.vcores[i]; + if (!vc) + continue; + spin_lock(&vc->lock); + vc->lpcr = (vc->lpcr & ~mask) | lpcr; + spin_unlock(&vc->lock); + if (++cores_done >= kvm->arch.online_vcores) + break; + } +} + +static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) +{ + return; +} + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -1737,7 +1867,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long hva; struct kvm_memory_slot *memslot; struct vm_area_struct *vma; - unsigned long lpcr, senc; + unsigned long lpcr = 0, senc; + unsigned long lpcr_mask = 0; unsigned long psize, porder; unsigned long rma_size; unsigned long rmls; @@ -1802,9 +1933,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; - lpcr |= senc << (LPCR_VRMASD_SH - 4); - kvm->arch.lpcr = lpcr; + lpcr_mask = LPCR_VRMASD; + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); /* Create HPTEs in the hash page table for the VRMA */ kvmppc_map_vrma(vcpu, memslot, porder); @@ -1825,23 +1956,21 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) kvm->arch.rma = ri; /* Update LPCR and RMOR */ - lpcr = kvm->arch.lpcr; if (cpu_has_feature(CPU_FTR_ARCH_201)) { /* PPC970; insert RMLS value (split field) in HID4 */ - lpcr &= ~((1ul << HID4_RMLS0_SH) | - (3ul << HID4_RMLS2_SH)); - lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | + lpcr_mask = (1ul << HID4_RMLS0_SH) | + (3ul << HID4_RMLS2_SH) | HID4_RMOR; + lpcr = ((rmls >> 2) << HID4_RMLS0_SH) | ((rmls & 3) << HID4_RMLS2_SH); /* RMOR is also in HID4 */ lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) << HID4_RMOR_SH; } else { /* POWER7 */ - lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); - lpcr |= rmls << LPCR_RMLS_SH; + lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS; + lpcr = rmls << LPCR_RMLS_SH; kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT; } - kvm->arch.lpcr = lpcr; pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); @@ -1860,6 +1989,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } } + kvmppc_update_lpcr(kvm, lpcr, lpcr_mask); + /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ smp_wmb(); kvm->arch.rma_setup_done = 1; @@ -1875,7 +2006,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; @@ -1893,9 +2024,6 @@ int kvmppc_core_init_vm(struct kvm *kvm) */ cpumask_setall(&kvm->arch.need_tlb_flush); - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - INIT_LIST_HEAD(&kvm->arch.rtas_tokens); - kvm->arch.rma = NULL; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -1931,61 +2059,162 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_free_vcores(struct kvm *kvm) +{ + long int i; + + for (i = 0; i < KVM_MAX_VCORES; ++i) + kfree(kvm->arch.vcores[i]); + kvm->arch.online_vcores = 0; +} + +static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { uninhibit_secondary_onlining(); + kvmppc_free_vcores(kvm); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; } - kvmppc_rtas_tokens_free(kvm); - kvmppc_free_hpt(kvm); - WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); } -/* These are stubs for now */ -void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +/* We don't need to emulate any privileged instructions or dcbz */ +static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { + return EMULATE_FAIL; } -/* We don't need to emulate any privileged instructions or dcbz */ -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +static int kvmppc_core_check_processor_compat_hv(void) { - return EMULATE_FAIL; + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EIO; + return 0; } -static int kvmppc_book3s_hv_init(void) +static long kvm_arch_vm_ioctl_hv(struct file *filp, + unsigned int ioctl, unsigned long arg) { - int r; + struct kvm *kvm __maybe_unused = filp->private_data; + void __user *argp = (void __user *)arg; + long r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + switch (ioctl) { - if (r) + case KVM_ALLOCATE_RMA: { + struct kvm_allocate_rma rma; + struct kvm *kvm = filp->private_data; + + r = kvm_vm_ioctl_allocate_rma(kvm, &rma); + if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + r = -EFAULT; + break; + } + + case KVM_PPC_ALLOCATE_HTAB: { + u32 htab_order; + + r = -EFAULT; + if (get_user(htab_order, (u32 __user *)argp)) + break; + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + if (r) + break; + r = -EFAULT; + if (put_user(htab_order, (u32 __user *)argp)) + break; + r = 0; + break; + } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } + + default: + r = -ENOTTY; + } + + return r; +} + +static struct kvmppc_ops kvm_ops_hv = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, + .get_one_reg = kvmppc_get_one_reg_hv, + .set_one_reg = kvmppc_set_one_reg_hv, + .vcpu_load = kvmppc_core_vcpu_load_hv, + .vcpu_put = kvmppc_core_vcpu_put_hv, + .set_msr = kvmppc_set_msr_hv, + .vcpu_run = kvmppc_vcpu_run_hv, + .vcpu_create = kvmppc_core_vcpu_create_hv, + .vcpu_free = kvmppc_core_vcpu_free_hv, + .check_requests = kvmppc_core_check_requests_hv, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv, + .flush_memslot = kvmppc_core_flush_memslot_hv, + .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, + .commit_memory_region = kvmppc_core_commit_memory_region_hv, + .unmap_hva = kvm_unmap_hva_hv, + .unmap_hva_range = kvm_unmap_hva_range_hv, + .age_hva = kvm_age_hva_hv, + .test_age_hva = kvm_test_age_hva_hv, + .set_spte_hva = kvm_set_spte_hva_hv, + .mmu_destroy = kvmppc_mmu_destroy_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .create_memslot = kvmppc_core_create_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, + .destroy_vm = kvmppc_core_destroy_vm_hv, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, + .emulate_op = kvmppc_core_emulate_op_hv, + .emulate_mtspr = kvmppc_core_emulate_mtspr_hv, + .emulate_mfspr = kvmppc_core_emulate_mfspr_hv, + .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, + .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, +}; + +static int kvmppc_book3s_init_hv(void) +{ + int r; + /* + * FIXME!! Do we need to check on all cpus ? + */ + r = kvmppc_core_check_processor_compat_hv(); + if (r < 0) return r; - r = kvmppc_mmu_hv_init(); + kvm_ops_hv.owner = THIS_MODULE; + kvmppc_hv_ops = &kvm_ops_hv; + r = kvmppc_mmu_hv_init(); return r; } -static void kvmppc_book3s_hv_exit(void) +static void kvmppc_book3s_exit_hv(void) { - kvm_exit(); + kvmppc_hv_ops = NULL; } -module_init(kvmppc_book3s_hv_init); -module_exit(kvmppc_book3s_hv_exit); +module_init(kvmppc_book3s_init_hv); +module_exit(kvmppc_book3s_exit_hv); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 37f1cc4..928142c 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -158,9 +158,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * Interrupts are enabled again at this point. */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c71103b..bc8de75 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -33,30 +33,6 @@ #error Need to fix lppaca and SLB shadow accesses in little endian mode #endif -/***************************************************************************** - * * - * Real Mode handlers that need to be in the linear mapping * - * * - ****************************************************************************/ - - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - mfspr r13,SPRN_SRR0 - addi r13,r13,4 - mtspr SPRN_SRR0,r13 - GET_SCRATCH0(r13) - rfid - b . - - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - mfspr r13,SPRN_HSRR0 - addi r13,r13,4 - mtspr SPRN_HSRR0,r13 - GET_SCRATCH0(r13) - hrfid - b . - /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -66,8 +42,11 @@ kvmppc_skip_Hinterrupt: * LR = return address to continue at after eventually re-enabling MMU */ _GLOBAL(kvmppc_hv_entry_trampoline) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) mfmsr r10 - LOAD_REG_ADDR(r5, kvmppc_hv_entry) + LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) li r0,MSR_RI andc r0,r10,r0 li r6,MSR_IR | MSR_DR @@ -77,11 +56,103 @@ _GLOBAL(kvmppc_hv_entry_trampoline) mtsrr1 r6 RFI -/****************************************************************************** - * * - * Entry code * - * * - *****************************************************************************/ +kvmppc_call_hv_entry: + bl kvmppc_hv_entry + + /* Back from guest - restore host state and return to caller */ + + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 + + /* Restore SPRG3 */ + ld r3,PACA_SPRG3(r13) + mtspr SPRN_SPRG3,r3 + + /* + * Reload DEC. HDEC interrupts were disabled when + * we reloaded the host's LPCR value. + */ + ld r3, HSTATE_DECEXP(r13) + mftb r4 + subf r4, r4, r3 + mtspr SPRN_DEC, r4 + + /* Reload the host's PMU registers */ + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r4, LPPACA_PMCINUSE(r3) + cmpwi r4, 0 + beq 23f /* skip if not */ + lwz r3, HSTATE_PMC(r13) + lwz r4, HSTATE_PMC + 4(r13) + lwz r5, HSTATE_PMC + 8(r13) + lwz r6, HSTATE_PMC + 12(r13) + lwz r8, HSTATE_PMC + 16(r13) + lwz r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + lwz r10, HSTATE_PMC + 24(r13) + lwz r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + ld r3, HSTATE_MMCR(r13) + ld r4, HSTATE_MMCR + 8(r13) + ld r5, HSTATE_MMCR + 16(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_MMCR0, r3 + isync +23: + + /* + * For external and machine check interrupts, we need + * to call the Linux handler to process the interrupt. + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the + * handler will return to the book3s_hv_interrupts.S code. + * For other interrupts we do the rfid to get back + * to the book3s_hv_interrupts.S code here. + */ + ld r8, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + ld r7, HSTATE_HOST_MSR(r13) + + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION + beq 11f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + + /* RFI into the highmem handler, or branch to interrupt handler */ + mfmsr r6 + li r0, MSR_RI + andc r6, r6, r0 + mtmsrd r6, 1 /* Clear RI in MSR */ + mtsrr0 r8 + mtsrr1 r7 + beqa 0x500 /* external interrupt (PPC970) */ + beq cr1, 13f /* machine check */ + RFI + + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0x500 + +13: b machine_check_fwnmi + /* * We come in here when wakened from nap mode on a secondary hw thread. @@ -137,7 +208,7 @@ kvm_start_guest: cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ beq kvm_no_guest - b kvmppc_hv_entry + b 30f 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ b kvm_no_guest @@ -147,6 +218,57 @@ kvm_start_guest: stw r8,HSTATE_SAVED_XIRR(r13) b kvm_no_guest +30: bl kvmppc_hv_entry + + /* Back from the guest, go back to nap */ + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + lwsync + /* Clear any pending IPI - we're an offline thread */ + ld r5, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + lwzcix r3, r5, r7 /* ack any pending interrupt */ + rlwinm. r0, r3, 0, 0xffffff /* any pending? */ + beq 37f + sync + li r0, 0xff + li r6, XICS_MFRR + stbcix r0, r5, r6 /* clear the IPI */ + stwcix r3, r5, r7 /* EOI it */ +37: sync + + /* increment the nap count and then go to nap mode */ + ld r4, HSTATE_KVM_VCORE(r13) + addi r4, r4, VCORE_NAP_COUNT + lwsync /* make previous updates visible */ +51: lwarx r3, 0, r4 + addi r3, r3, 1 + stwcx. r3, 0, r4 + bne 51b + +kvm_no_guest: + li r0, KVM_HWTHREAD_IN_NAP + stb r0, HSTATE_HWTHREAD_STATE(r13) + li r3, LPCR_PECE0 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR, r4 + isync + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + .global kvmppc_hv_entry kvmppc_hv_entry: @@ -159,7 +281,8 @@ kvmppc_hv_entry: * all other volatile GPRS = free */ mflr r0 - std r0, HSTATE_VMHANDLER(r13) + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -200,8 +323,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) ld r3, VCPU_MMCR(r4) ld r5, VCPU_MMCR + 8(r4) ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) mtspr SPRN_MMCR1, r5 mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 mtspr SPRN_MMCR0, r3 isync @@ -254,22 +381,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) - /* Increment yield count if they have a VPA */ - ld r3, VCPU_VPA(r4) - cmpdi r3, 0 - beq 25f - lwz r5, LPPACA_YIELDCOUNT(r3) - addi r5, r5, 1 - stw r5, LPPACA_YIELDCOUNT(r3) - li r6, 1 - stb r6, VCPU_VPA_DIRTY(r4) -25: /* Load up DAR and DSISR */ ld r5, VCPU_DAR(r4) lwz r6, VCPU_DSISR(r4) mtspr SPRN_DAR, r5 mtspr SPRN_DSISR, r6 + li r6, KVM_GUEST_MODE_HOST_HV + stb r6, HSTATE_IN_GUEST(r13) + BEGIN_FTR_SECTION /* Restore AMR and UAMOR, set AMOR to all 1s */ ld r5,VCPU_AMR(r4) @@ -343,7 +463,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) bdnz 28b ptesync -22: li r0,1 + /* Add timebase offset onto timebase */ +22: ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 37f + mftb r6 /* current host timebase */ + add r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 37f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Load guest PCR value to select appropriate compat mode */ +37: ld r7, VCORE_PCR(r5) + cmpdi r7, 0 + beq 38f + mtspr SPRN_PCR, r7 +38: + li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ b 10f @@ -353,12 +494,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) beq 20b /* Set LPCR and RMOR. */ -10: ld r8,KVM_LPCR(r9) +10: ld r8,VCORE_LPCR(r5) mtspr SPRN_LPCR,r8 ld r8,KVM_RMOR(r9) mtspr SPRN_RMOR,r8 isync + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + lwz r5, LPPACA_YIELDCOUNT(r3) + addi r5, r5, 1 + stw r5, LPPACA_YIELDCOUNT(r3) + li r6, 1 + stb r6, VCPU_VPA_DIRTY(r4) +25: /* Check if HDEC expires soon */ mfspr r3,SPRN_HDEC cmpwi r3,10 @@ -405,7 +556,8 @@ toc_tlbie_lock: bne 24b isync - ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r7,VCORE_LPCR(r5) /* use vcore->lpcr to store HID4 */ li r0,0x18f rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ or r0,r7,r0 @@ -541,7 +693,7 @@ fast_guest_return: mtspr SPRN_HSRR1,r11 /* Activate guest mode, so faults get handled by KVM */ - li r9, KVM_GUEST_MODE_GUEST + li r9, KVM_GUEST_MODE_GUEST_HV stb r9, HSTATE_IN_GUEST(r13) /* Enter guest */ @@ -550,13 +702,15 @@ BEGIN_FTR_SECTION ld r5, VCPU_CFAR(r4) mtspr SPRN_CFAR, r5 END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r0, VCPU_PPR(r4) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) lwz r6, VCPU_CR(r4) mtlr r5 mtcr r6 - ld r0, VCPU_GPR(R0)(r4) ld r1, VCPU_GPR(R1)(r4) ld r2, VCPU_GPR(R2)(r4) ld r3, VCPU_GPR(R3)(r4) @@ -570,6 +724,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r12, VCPU_GPR(R12)(r4) ld r13, VCPU_GPR(R13)(r4) +BEGIN_FTR_SECTION + mtspr SPRN_PPR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) hrfid @@ -584,8 +742,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) /* * We come here from the first-level interrupt handlers. */ - .globl kvmppc_interrupt -kvmppc_interrupt: + .globl kvmppc_interrupt_hv +kvmppc_interrupt_hv: /* * Register contents: * R12 = interrupt vector @@ -595,6 +753,19 @@ kvmppc_interrupt: */ /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ std r9, HSTATE_HOST_R2(r13) + + lbz r9, HSTATE_IN_GUEST(r13) + cmpwi r9, KVM_GUEST_MODE_HOST_HV + beq kvmppc_bad_host_intr +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + cmpwi r9, KVM_GUEST_MODE_GUEST + ld r9, HSTATE_HOST_R2(r13) + beq kvmppc_interrupt_pr +#endif + /* We're now back in the host but in guest MMU context */ + li r9, KVM_GUEST_MODE_HOST_HV + stb r9, HSTATE_IN_GUEST(r13) + ld r9, HSTATE_KVM_VCPU(r13) /* Save registers */ @@ -620,6 +791,10 @@ BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) std r3, VCPU_CFAR(r9) END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r4, HSTATE_PPR(r13) + std r4, VCPU_PPR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Restore R1/R2 so we can handle faults */ ld r1, HSTATE_HOST_R1(r13) @@ -642,10 +817,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) std r3, VCPU_GPR(R13)(r9) std r4, VCPU_LR(r9) - /* Unset guest mode */ - li r0, KVM_GUEST_MODE_NONE - stb r0, HSTATE_IN_GUEST(r13) - stw r12,VCPU_TRAP(r9) /* Save HEIR (HV emulation assist reg) in last_inst @@ -696,46 +867,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) * set, we know the host wants us out so let's do it now */ do_ext_interrupt: - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne ext_interrupt_to_host - - /* Now read the interrupt from the ICP */ - ld r5, HSTATE_XICS_PHYS(r13) - li r7, XICS_XIRR - cmpdi r5, 0 - beq- ext_interrupt_to_host - lwzcix r3, r5, r7 - rlwinm. r0, r3, 0, 0xffffff - sync - beq 3f /* if nothing pending in the ICP */ - - /* We found something in the ICP... - * - * If it's not an IPI, stash it in the PACA and return to - * the host, we don't (yet) handle directing real external - * interrupts directly to the guest - */ - cmpwi r0, XICS_IPI - bne ext_stash_for_host - - /* It's an IPI, clear the MFRR and EOI it */ - li r0, 0xff - li r6, XICS_MFRR - stbcix r0, r5, r6 /* clear the IPI */ - stwcix r3, r5, r7 /* EOI it */ - sync - - /* We need to re-check host IPI now in case it got set in the - * meantime. If it's clear, we bounce the interrupt to the - * guest - */ - lbz r0, HSTATE_HOST_IPI(r13) - cmpwi r0, 0 - bne- 1f + bl kvmppc_read_intr + cmpdi r3, 0 + bgt ext_interrupt_to_host /* Allright, looks like an IPI for the guest, we need to set MER */ -3: /* Check if any CPU is heading out to the host, if so head out too */ ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) @@ -764,27 +900,9 @@ do_ext_interrupt: mtspr SPRN_LPCR, r8 b fast_guest_return - /* We raced with the host, we need to resend that IPI, bummer */ -1: li r0, IPI_PRIORITY - stbcix r0, r5, r6 /* set the IPI */ - sync - b ext_interrupt_to_host - -ext_stash_for_host: - /* It's not an IPI and it's for the host, stash it in the PACA - * before exit, it will be picked up by the host ICP driver - */ - stw r3, HSTATE_SAVED_XIRR(r13) ext_interrupt_to_host: guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save DEC */ - mfspr r5,SPRN_DEC - mftb r6 - extsw r5,r5 - add r5,r5,r6 - std r5,VCPU_DEC_EXPIRES(r9) - /* Save more register state */ mfdar r6 mfdsisr r7 @@ -954,7 +1072,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync - li r0,0 + + /* Subtract timebase offset from timebase */ + ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 17f + mftb r6 /* current host timebase */ + subf r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 17f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Reset PCR */ +17: ld r0, VCORE_PCR(r5) + cmpdi r0, 0 + beq 18f + li r0, 0 + mtspr SPRN_PCR, r0 +18: + /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 @@ -1052,6 +1193,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1: addi r8,r8,16 .endr + /* Save DEC */ + mfspr r5,SPRN_DEC + mftb r6 + extsw r5,r5 + add r5,r5,r6 + std r5,VCPU_DEC_EXPIRES(r9) + /* Save and reset AMR and UAMOR before turning on the MMU */ BEGIN_FTR_SECTION mfspr r5,SPRN_AMR @@ -1062,6 +1210,10 @@ BEGIN_FTR_SECTION mtspr SPRN_AMR,r6 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* Unset guest mode */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + /* Switch DSCR back to host value */ BEGIN_FTR_SECTION mfspr r8, SPRN_DSCR @@ -1134,9 +1286,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ b 22f 21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR std r4, VCPU_MMCR(r9) std r5, VCPU_MMCR + 8(r9) std r6, VCPU_MMCR + 16(r9) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) mfspr r3, SPRN_PMC1 mfspr r4, SPRN_PMC2 mfspr r5, SPRN_PMC3 @@ -1158,103 +1314,30 @@ BEGIN_FTR_SECTION stw r11, VCPU_PMC + 28(r9) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 22: + ld r0, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + mtlr r0 + blr +secondary_too_late: + ld r5,HSTATE_KVM_VCORE(r13) + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + ld r11,PACA_SLBSHADOWPTR(r13) - /* Secondary threads go off to take a nap on POWER7 */ -BEGIN_FTR_SECTION - lwz r0,VCPU_PTID(r9) - cmpwi r0,0 - bne secondary_nap -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* Restore host DABR and DABRX */ - ld r5,HSTATE_DABR(r13) - li r6,7 - mtspr SPRN_DABR,r5 - mtspr SPRN_DABRX,r6 - - /* Restore SPRG3 */ - ld r3,PACA_SPRG3(r13) - mtspr SPRN_SPRG3,r3 - - /* - * Reload DEC. HDEC interrupts were disabled when - * we reloaded the host's LPCR value. - */ - ld r3, HSTATE_DECEXP(r13) - mftb r4 - subf r4, r4, r3 - mtspr SPRN_DEC, r4 - - /* Reload the host's PMU registers */ - ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ - lbz r4, LPPACA_PMCINUSE(r3) - cmpwi r4, 0 - beq 23f /* skip if not */ - lwz r3, HSTATE_PMC(r13) - lwz r4, HSTATE_PMC + 4(r13) - lwz r5, HSTATE_PMC + 8(r13) - lwz r6, HSTATE_PMC + 12(r13) - lwz r8, HSTATE_PMC + 16(r13) - lwz r9, HSTATE_PMC + 20(r13) -BEGIN_FTR_SECTION - lwz r10, HSTATE_PMC + 24(r13) - lwz r11, HSTATE_PMC + 28(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 -BEGIN_FTR_SECTION - mtspr SPRN_PMC7, r10 - mtspr SPRN_PMC8, r11 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - ld r3, HSTATE_MMCR(r13) - ld r4, HSTATE_MMCR + 8(r13) - ld r5, HSTATE_MMCR + 16(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_MMCR0, r3 - isync -23: - /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to absolute address 0x500 for - * external interrupts, or the machine_check_fwnmi label - * for machine checks (since firmware might have patched - * the vector area at 0x200). The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_hv_interrupts.S code here. - */ - ld r8, HSTATE_VMHANDLER(r13) - ld r7, HSTATE_HOST_MSR(r13) - - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL -BEGIN_FTR_SECTION - beq 11f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - - /* RFI into the highmem handler, or branch to interrupt handler */ - mfmsr r6 - li r0, MSR_RI - andc r6, r6, r0 - mtmsrd r6, 1 /* Clear RI in MSR */ - mtsrr0 r8 - mtsrr1 r7 - beqa 0x500 /* external interrupt (PPC970) */ - beq cr1, 13f /* machine check */ - RFI - - /* On POWER7, we have external interrupts set to use HSRR0/1 */ -11: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0x500 - -13: b machine_check_fwnmi + .rept SLB_NUM_BOLTED + ld r5,SLBSHADOW_SAVEAREA(r11) + ld r6,SLBSHADOW_SAVEAREA+8(r11) + andis. r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r11,r11,16 + .endr + b 22b /* * Check whether an HDSI is an HPTE not found fault or something else. @@ -1333,7 +1416,7 @@ fast_interrupt_c_return: stw r8, VCPU_LAST_INST(r9) /* Unset guest mode. */ - li r0, KVM_GUEST_MODE_NONE + li r0, KVM_GUEST_MODE_HOST_HV stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont @@ -1701,67 +1784,70 @@ machine_check_realmode: rotldi r11, r11, 63 b fast_interrupt_c_return -secondary_too_late: - ld r5,HSTATE_KVM_VCORE(r13) - HMT_LOW -13: lbz r3,VCORE_IN_GUEST(r5) - cmpwi r3,0 - bne 13b - HMT_MEDIUM - ld r11,PACA_SLBSHADOWPTR(r13) - - .rept SLB_NUM_BOLTED - ld r5,SLBSHADOW_SAVEAREA(r11) - ld r6,SLBSHADOW_SAVEAREA+8(r11) - andis. r7,r5,SLB_ESID_V@h - beq 1f - slbmte r6,r5 -1: addi r11,r11,16 - .endr +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + */ +kvmppc_read_intr: + /* see if a host IPI is pending */ + li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne 1f -secondary_nap: - /* Clear our vcpu pointer so we don't come back in early */ - li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) - lwsync - /* Clear any pending IPI - assume we're a secondary thread */ - ld r5, HSTATE_XICS_PHYS(r13) + /* Now read the interrupt from the ICP */ + ld r6, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR - lwzcix r3, r5, r7 /* ack any pending interrupt */ - rlwinm. r0, r3, 0, 0xffffff /* any pending? */ - beq 37f + cmpdi r6, 0 + beq- 1f + lwzcix r0, r6, r7 + rlwinm. r3, r0, 0, 0xffffff sync - li r0, 0xff - li r6, XICS_MFRR - stbcix r0, r5, r6 /* clear the IPI */ - stwcix r3, r5, r7 /* EOI it */ -37: sync + beq 1f /* if nothing pending in the ICP */ - /* increment the nap count and then go to nap mode */ - ld r4, HSTATE_KVM_VCORE(r13) - addi r4, r4, VCORE_NAP_COUNT - lwsync /* make previous updates visible */ -51: lwarx r3, 0, r4 - addi r3, r3, 1 - stwcx. r3, 0, r4 - bne 51b + /* We found something in the ICP... + * + * If it's not an IPI, stash it in the PACA and return to + * the host, we don't (yet) handle directing real external + * interrupts directly to the guest + */ + cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ + li r3, 1 + bne 42f -kvm_no_guest: - li r0, KVM_HWTHREAD_IN_NAP - stb r0, HSTATE_HWTHREAD_STATE(r13) + /* It's an IPI, clear the MFRR and EOI it */ + li r3, 0xff + li r8, XICS_MFRR + stbcix r3, r6, r8 /* clear the IPI */ + stwcix r0, r6, r7 /* EOI it */ + sync - li r3, LPCR_PECE0 - mfspr r4, SPRN_LPCR - rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 - mtspr SPRN_LPCR, r4 - isync - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b - nap - b . + /* We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne- 43f + + /* OK, it's an IPI for us */ + li r3, -1 +1: blr + +42: /* It's not an IPI and it's for the host, stash it in the PACA + * before exit, it will be picked up by the host ICP driver + */ + stw r0, HSTATE_SAVED_XIRR(r13) + b 1b + +43: /* We raced with the host, we need to resend that IPI, bummer */ + li r0, IPI_PRIORITY + stbcix r0, r6, r8 /* set the IPI */ + sync + b 1b /* * Save away FP, VMX and VSX registers. @@ -1879,3 +1965,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) lwz r7,VCPU_VRSAVE(r4) mtspr SPRN_VRSAVE,r7 blr + +/* + * We come here if we get any exception or interrupt while we are + * executing host real mode code while in guest MMU context. + * For now just spin, but we should do something better. + */ +kvmppc_bad_host_intr: + b . diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index 17cfae5..f4dd041 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -26,8 +26,12 @@ #if defined(CONFIG_PPC_BOOK3S_64) #define FUNC(name) GLUE(.,name) +#define GET_SHADOW_VCPU(reg) addi reg, r13, PACA_SVCPU + #elif defined(CONFIG_PPC_BOOK3S_32) #define FUNC(name) name +#define GET_SHADOW_VCPU(reg) lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) + #endif /* CONFIG_PPC_BOOK3S_XX */ #define VCPU_LOAD_NVGPRS(vcpu) \ @@ -87,8 +91,14 @@ kvm_start_entry: VCPU_LOAD_NVGPRS(r4) kvm_start_lightweight: + /* Copy registers into shadow vcpu so we can access them in real mode */ + GET_SHADOW_VCPU(r3) + bl FUNC(kvmppc_copy_to_svcpu) + nop + REST_GPR(4, r1) #ifdef CONFIG_PPC_BOOK3S_64 + /* Get the dcbz32 flag */ PPC_LL r3, VCPU_HFLAGS(r4) rldicl r3, r3, 0, 63 /* r3 &= 1 */ stb r3, HSTATE_RESTORE_HID5(r13) @@ -111,9 +121,6 @@ kvm_start_lightweight: * */ -.global kvmppc_handler_highmem -kvmppc_handler_highmem: - /* * Register usage at this point: * @@ -125,18 +132,31 @@ kvmppc_handler_highmem: * */ - /* R7 = vcpu */ - PPC_LL r7, GPR4(r1) + /* Transfer reg values from shadow vcpu back to vcpu struct */ + /* On 64-bit, interrupts are still off at this point */ + PPC_LL r3, GPR4(r1) /* vcpu pointer */ + GET_SHADOW_VCPU(r4) + bl FUNC(kvmppc_copy_from_svcpu) + nop #ifdef CONFIG_PPC_BOOK3S_64 + /* Re-enable interrupts */ + ld r3, HSTATE_HOST_MSR(r13) + ori r3, r3, MSR_EE + MTMSR_EERI(r3) + /* * Reload kernel SPRG3 value. * No need to save guest value as usermode can't modify SPRG3. */ ld r3, PACA_SPRG3(r13) mtspr SPRN_SPRG3, r3 + #endif /* CONFIG_PPC_BOOK3S_64 */ + /* R7 = vcpu */ + PPC_LL r7, GPR4(r1) + PPC_STL r14, VCPU_GPR(R14)(r7) PPC_STL r15, VCPU_GPR(R15)(r7) PPC_STL r16, VCPU_GPR(R16)(r7) @@ -161,7 +181,7 @@ kvmppc_handler_highmem: /* Restore r3 (kvm_run) and r4 (vcpu) */ REST_2GPRS(3, r1) - bl FUNC(kvmppc_handle_exit) + bl FUNC(kvmppc_handle_exit_pr) /* If RESUME_GUEST, get back in the loop */ cmpwi r3, RESUME_GUEST diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index da8b13c..5a1ab12 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -28,7 +28,7 @@ #include <asm/mmu_context.h> #include <asm/hw_irq.h> -#include "trace.h" +#include "trace_pr.h" #define PTE_SIZE 12 @@ -56,6 +56,14 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) HPTEG_HASH_BITS_VPTE_LONG); } +#ifdef CONFIG_PPC_BOOK3S_64 +static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage) +{ + return hash_64((vpage & 0xffffffff0ULL) >> 4, + HPTEG_HASH_BITS_VPTE_64K); +} +#endif + void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) { u64 index; @@ -83,6 +91,15 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_add_head_rcu(&pte->list_vpte_long, &vcpu3s->hpte_hash_vpte_long[index]); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Add to vPTE_64k list */ + index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte_64k, + &vcpu3s->hpte_hash_vpte_64k[index]); +#endif + + vcpu3s->hpte_cache_count++; + spin_unlock(&vcpu3s->mmu_lock); } @@ -113,10 +130,13 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_del_init_rcu(&pte->list_pte_long); hlist_del_init_rcu(&pte->list_vpte); hlist_del_init_rcu(&pte->list_vpte_long); +#ifdef CONFIG_PPC_BOOK3S_64 + hlist_del_init_rcu(&pte->list_vpte_64k); +#endif + vcpu3s->hpte_cache_count--; spin_unlock(&vcpu3s->mmu_lock); - vcpu3s->hpte_cache_count--; call_rcu(&pte->rcu_head, free_pte_rcu); } @@ -219,6 +239,29 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) rcu_read_unlock(); } +#ifdef CONFIG_PPC_BOOK3S_64 +/* Flush with mask 0xffffffff0 */ +static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xffffffff0ULL; + + list = &vcpu3s->hpte_hash_vpte_64k[ + kvmppc_mmu_hash_vpte_64k(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte_64k) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} +#endif + /* Flush with mask 0xffffff000 */ static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) { @@ -249,6 +292,11 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) case 0xfffffffffULL: kvmppc_mmu_pte_vflush_short(vcpu, guest_vp); break; +#ifdef CONFIG_PPC_BOOK3S_64 + case 0xffffffff0ULL: + kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp); + break; +#endif case 0xffffff000ULL: kvmppc_mmu_pte_vflush_long(vcpu, guest_vp); break; @@ -285,15 +333,19 @@ struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); struct hpte_cache *pte; - pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); - vcpu3s->hpte_cache_count++; - if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) kvmppc_mmu_pte_flush_all(vcpu); + pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); + return pte; } +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte) +{ + kmem_cache_free(hpte_cache, pte); +} + void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu) { kvmppc_mmu_pte_flush(vcpu, 0, 0); @@ -320,6 +372,10 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); +#ifdef CONFIG_PPC_BOOK3S_64 + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k)); +#endif spin_lock_init(&vcpu3s->mmu_lock); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c0b48f96..fe14ca3 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -40,8 +40,12 @@ #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/highmem.h> +#include <linux/module.h> -#include "trace.h" +#include "book3s.h" + +#define CREATE_TRACE_POINTS +#include "trace_pr.h" /* #define EXIT_DEBUG */ /* #define DEBUG_EXT */ @@ -56,29 +60,25 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define HW_PAGE_SIZE PAGE_SIZE #endif -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); - memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; svcpu_put(svcpu); #endif vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 - current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; + current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; #endif } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_BOOK3S_64 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); - memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, - sizeof(get_paca()->shadow_vcpu)); to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; svcpu_put(svcpu); #endif @@ -87,7 +87,61 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +/* Copy data needed by real-mode code from vcpu to shadow vcpu */ +void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, + struct kvm_vcpu *vcpu) +{ + svcpu->gpr[0] = vcpu->arch.gpr[0]; + svcpu->gpr[1] = vcpu->arch.gpr[1]; + svcpu->gpr[2] = vcpu->arch.gpr[2]; + svcpu->gpr[3] = vcpu->arch.gpr[3]; + svcpu->gpr[4] = vcpu->arch.gpr[4]; + svcpu->gpr[5] = vcpu->arch.gpr[5]; + svcpu->gpr[6] = vcpu->arch.gpr[6]; + svcpu->gpr[7] = vcpu->arch.gpr[7]; + svcpu->gpr[8] = vcpu->arch.gpr[8]; + svcpu->gpr[9] = vcpu->arch.gpr[9]; + svcpu->gpr[10] = vcpu->arch.gpr[10]; + svcpu->gpr[11] = vcpu->arch.gpr[11]; + svcpu->gpr[12] = vcpu->arch.gpr[12]; + svcpu->gpr[13] = vcpu->arch.gpr[13]; + svcpu->cr = vcpu->arch.cr; + svcpu->xer = vcpu->arch.xer; + svcpu->ctr = vcpu->arch.ctr; + svcpu->lr = vcpu->arch.lr; + svcpu->pc = vcpu->arch.pc; +} + +/* Copy data touched by real-mode code from shadow vcpu back to vcpu */ +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, + struct kvmppc_book3s_shadow_vcpu *svcpu) +{ + vcpu->arch.gpr[0] = svcpu->gpr[0]; + vcpu->arch.gpr[1] = svcpu->gpr[1]; + vcpu->arch.gpr[2] = svcpu->gpr[2]; + vcpu->arch.gpr[3] = svcpu->gpr[3]; + vcpu->arch.gpr[4] = svcpu->gpr[4]; + vcpu->arch.gpr[5] = svcpu->gpr[5]; + vcpu->arch.gpr[6] = svcpu->gpr[6]; + vcpu->arch.gpr[7] = svcpu->gpr[7]; + vcpu->arch.gpr[8] = svcpu->gpr[8]; + vcpu->arch.gpr[9] = svcpu->gpr[9]; + vcpu->arch.gpr[10] = svcpu->gpr[10]; + vcpu->arch.gpr[11] = svcpu->gpr[11]; + vcpu->arch.gpr[12] = svcpu->gpr[12]; + vcpu->arch.gpr[13] = svcpu->gpr[13]; + vcpu->arch.cr = svcpu->cr; + vcpu->arch.xer = svcpu->xer; + vcpu->arch.ctr = svcpu->ctr; + vcpu->arch.lr = svcpu->lr; + vcpu->arch.pc = svcpu->pc; + vcpu->arch.shadow_srr1 = svcpu->shadow_srr1; + vcpu->arch.fault_dar = svcpu->fault_dar; + vcpu->arch.fault_dsisr = svcpu->fault_dsisr; + vcpu->arch.last_inst = svcpu->last_inst; +} + +static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) { int r = 1; /* Indicate we want to get back into the guest */ @@ -100,44 +154,69 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) } /************* MMU Notifiers *************/ +static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + long i; + struct kvm_vcpu *vcpu; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. + */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, + gfn_end << PAGE_SHIFT); + } +} -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) { trace_kvm_unmap_hva(hva); - /* - * Flush all shadow tlb entries everywhere. This is slow, but - * we are 100% sure that we catch the to be unmapped page - */ - kvm_flush_remote_tlbs(kvm); + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); return 0; } -int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, + unsigned long end) { - /* kvm_unmap_hva flushes everything anyways */ - kvm_unmap_hva(kvm, start); + do_kvm_unmap_hva(kvm, start, end); return 0; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) { /* XXX could be more clever ;) */ return 0; } -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) { /* XXX could be more clever ;) */ return 0; } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) { /* The page will get remapped properly on its next fault */ - kvm_unmap_hva(kvm, hva); + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); } /*****************************************/ @@ -159,7 +238,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) vcpu->arch.shadow_msr = smsr; } -void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) { ulong old_msr = vcpu->arch.shared->msr; @@ -219,7 +298,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); } -void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; @@ -256,6 +335,23 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + /* + * If they're asking for POWER6 or later, set the flag + * indicating that we can do multiple large page sizes + * and 1TB segments. + * Also set the flag that indicates that tlbie has the large + * page bit in the RB operand instead of the instruction. + */ + switch (PVR_VER(pvr)) { + case PVR_POWER6: + case PVR_POWER7: + case PVR_POWER7p: + case PVR_POWER8: + vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | + BOOK3S_HFLAG_NEW_TLBIE; + break; + } + #ifdef CONFIG_PPC_BOOK3S_32 /* 32 bit Book3S always has 32 byte dcbz */ vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; @@ -334,6 +430,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, ulong eaddr, int vec) { bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + bool iswrite = false; int r = RESUME_GUEST; int relocated; int page_found = 0; @@ -344,10 +441,12 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 vsid; relocated = data ? dr : ir; + if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE)) + iswrite = true; /* Resolve real address if translation turned on */ if (relocated) { - page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite); } else { pte.may_execute = true; pte.may_read = true; @@ -355,6 +454,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte.raddr = eaddr & KVM_PAM; pte.eaddr = eaddr; pte.vpage = eaddr >> 12; + pte.page_size = MMU_PAGE_64K; } switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { @@ -388,22 +488,18 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = svcpu->fault_dsisr; + vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr; vcpu->arch.shared->msr |= - (svcpu->shadow_srr1 & 0x00000000f8000000ULL); - svcpu_put(svcpu); + vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL; kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->msr |= - svcpu->shadow_srr1 & 0x00000000f8000000ULL; - svcpu_put(svcpu); + vcpu->arch.shadow_srr1 & 0x00000000f8000000ULL; kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -411,12 +507,20 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); } else if (!is_mmio && kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { + if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { + /* + * There is already a host HPTE there, presumably + * a read-only one for a page the guest thinks + * is writable, so get rid of it first. + */ + kvmppc_mmu_unmap_page(vcpu, &pte); + } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte); + kvmppc_mmu_map_page(vcpu, &pte, iswrite); if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && - (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) kvmppc_patch_dcbz(vcpu, &pte); } else { /* MMIO */ @@ -619,13 +723,15 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) if (lost_ext & MSR_FP) kvmppc_load_up_fpu(); +#ifdef CONFIG_ALTIVEC if (lost_ext & MSR_VEC) kvmppc_load_up_altivec(); +#endif current->thread.regs->msr |= lost_ext; } -int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int exit_nr) +int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) { int r = RESUME_HOST; int s; @@ -643,25 +749,32 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong shadow_srr1 = svcpu->shadow_srr1; + ulong shadow_srr1 = vcpu->arch.shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); - r = RESUME_GUEST; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]; svcpu_put(svcpu); - break; + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + r = RESUME_GUEST; + break; + } } #endif - svcpu_put(svcpu); /* only care about PTEG not found errors, but leave NX alone */ if (shadow_srr1 & 0x40000000) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { @@ -682,25 +795,36 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - u32 fault_dsisr = svcpu->fault_dsisr; + u32 fault_dsisr = vcpu->arch.fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) { - kvmppc_mmu_map_segment(vcpu, dar); - r = RESUME_GUEST; + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[dar >> SID_SHIFT]; svcpu_put(svcpu); - break; + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, dar); + r = RESUME_GUEST; + break; + } } #endif - svcpu_put(svcpu); - /* The only case we need to handle is missing shadow PTEs */ - if (fault_dsisr & DSISR_NOHPTE) { + /* + * We need to handle missing shadow PTEs, and + * protection faults due to us mapping a page read-only + * when the guest thinks it is writable. + */ + if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } else { vcpu->arch.shared->dar = dar; vcpu->arch.shared->dsisr = fault_dsisr; @@ -743,13 +867,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_H_EMUL_ASSIST: { enum emulation_result er; - struct kvmppc_book3s_shadow_vcpu *svcpu; ulong flags; program_interrupt: - svcpu = svcpu_get(vcpu); - flags = svcpu->shadow_srr1 & 0x1f0000ull; - svcpu_put(svcpu); + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; if (vcpu->arch.shared->msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -798,7 +919,7 @@ program_interrupt: ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; -#ifdef CONFIG_KVM_BOOK3S_64_PR +#ifdef CONFIG_PPC_BOOK3S_64 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; @@ -881,9 +1002,7 @@ program_interrupt: break; default: { - struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); - ulong shadow_srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); + ulong shadow_srr1 = vcpu->arch.shadow_srr1; /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); @@ -920,8 +1039,8 @@ program_interrupt: return r; } -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; @@ -947,13 +1066,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int i; - kvmppc_set_pvr(vcpu, sregs->pvr); + kvmppc_set_pvr_pr(vcpu, sregs->pvr); vcpu3s->sdr1 = sregs->u.s.sdr1; if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { @@ -983,7 +1102,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; @@ -1012,7 +1132,8 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = 0; @@ -1042,28 +1163,30 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) return r; } -int kvmppc_core_check_processor_compat(void) -{ - return 0; -} - -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; struct kvm_vcpu *vcpu; int err = -ENOMEM; unsigned long p; - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); - if (!vcpu_book3s) + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) goto out; - vcpu_book3s->shadow_vcpu = - kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL); - if (!vcpu_book3s->shadow_vcpu) + vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); + if (!vcpu_book3s) goto free_vcpu; + vcpu->arch.book3s = vcpu_book3s; + +#ifdef CONFIG_KVM_BOOK3S_32 + vcpu->arch.shadow_vcpu = + kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); + if (!vcpu->arch.shadow_vcpu) + goto free_vcpu3s; +#endif - vcpu = &vcpu_book3s->vcpu; err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_shadow_vcpu; @@ -1076,13 +1199,19 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.shared = (void *)(p + PAGE_SIZE - 4096); #ifdef CONFIG_PPC_BOOK3S_64 - /* default to book3s_64 (970fx) */ + /* + * Default to the same as the host if we're on sufficiently + * recent machine that we have 1TB segments; + * otherwise default to PPC970FX. + */ vcpu->arch.pvr = 0x3C0301; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + vcpu->arch.pvr = mfspr(SPRN_PVR); #else /* default to book3s_32 (750) */ vcpu->arch.pvr = 0x84202; #endif - kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; vcpu->arch.shadow_msr = MSR_USER64; @@ -1096,24 +1225,31 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: - kfree(vcpu_book3s->shadow_vcpu); -free_vcpu: +#ifdef CONFIG_KVM_BOOK3S_32 + kfree(vcpu->arch.shadow_vcpu); +free_vcpu3s: +#endif vfree(vcpu_book3s); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); kvm_vcpu_uninit(vcpu); - kfree(vcpu_book3s->shadow_vcpu); +#ifdef CONFIG_KVM_BOOK3S_32 + kfree(vcpu->arch.shadow_vcpu); +#endif vfree(vcpu_book3s); + kmem_cache_free(kvm_vcpu_cache, vcpu); } -int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; struct thread_fp_state fp; @@ -1216,8 +1352,8 @@ out: /* * Get (and clear) the dirty memory log for a memory slot. */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, + struct kvm_dirty_log *log) { struct kvm_memory_slot *memslot; struct kvm_vcpu *vcpu; @@ -1252,67 +1388,100 @@ out: return r; } -#ifdef CONFIG_PPC64 -int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - info->flags = KVM_PPC_1T_SEGMENTS; - - /* SLB is always 64 entries */ - info->slb_size = 64; - - /* Standard 4k base page size segment */ - info->sps[0].page_shift = 12; - info->sps[0].slb_enc = 0; - info->sps[0].enc[0].page_shift = 12; - info->sps[0].enc[0].pte_enc = 0; - - /* Standard 16M large page size segment */ - info->sps[1].page_shift = 24; - info->sps[1].slb_enc = SLB_VSID_L; - info->sps[1].enc[0].page_shift = 24; - info->sps[1].enc[0].pte_enc = 0; + return; +} +static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ return 0; } -#endif /* CONFIG_PPC64 */ -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) { + return; } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, - unsigned long npages) +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) { - return 0; + return; } -int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) +static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + +#ifdef CONFIG_PPC64 +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { -} + long int i; + struct kvm_vcpu *vcpu; + + info->flags = 0; + + /* SLB is always 64 entries */ + info->slb_size = 64; + + /* Standard 4k base page size segment */ + info->sps[0].page_shift = 12; + info->sps[0].slb_enc = 0; + info->sps[0].enc[0].page_shift = 12; + info->sps[0].enc[0].pte_enc = 0; + + /* + * 64k large page size. + * We only want to put this in if the CPUs we're emulating + * support it, but unfortunately we don't have a vcpu easily + * to hand here to test. Just pick the first vcpu, and if + * that doesn't exist yet, report the minimum capability, + * i.e., no 64k pages. + * 1T segment support goes along with 64k pages. + */ + i = 1; + vcpu = kvm_get_vcpu(kvm, 0); + if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + info->flags = KVM_PPC_1T_SEGMENTS; + info->sps[i].page_shift = 16; + info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01; + info->sps[i].enc[0].page_shift = 16; + info->sps[i].enc[0].pte_enc = 1; + ++i; + } + + /* Standard 16M large page size segment */ + info->sps[i].page_shift = 24; + info->sps[i].slb_enc = SLB_VSID_L; + info->sps[i].enc[0].page_shift = 24; + info->sps[i].enc[0].pte_enc = 0; -void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) + return 0; +} +#else +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) { + /* We should not get called */ + BUG(); } +#endif /* CONFIG_PPC64 */ static unsigned int kvm_global_user_count = 0; static DEFINE_SPINLOCK(kvm_global_user_count_lock); -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_pr(struct kvm *kvm) { -#ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - INIT_LIST_HEAD(&kvm->arch.rtas_tokens); -#endif + mutex_init(&kvm->arch.hpt_mutex); if (firmware_has_feature(FW_FEATURE_SET_MODE)) { spin_lock(&kvm_global_user_count_lock); @@ -1323,7 +1492,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) { #ifdef CONFIG_PPC64 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); @@ -1338,26 +1507,81 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) } } -static int kvmppc_book3s_init(void) +static int kvmppc_core_check_processor_compat_pr(void) { - int r; + /* we are always compatible */ + return 0; +} - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, - THIS_MODULE); +static long kvm_arch_vm_ioctl_pr(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -ENOTTY; +} - if (r) +static struct kvmppc_ops kvm_ops_pr = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, + .get_one_reg = kvmppc_get_one_reg_pr, + .set_one_reg = kvmppc_set_one_reg_pr, + .vcpu_load = kvmppc_core_vcpu_load_pr, + .vcpu_put = kvmppc_core_vcpu_put_pr, + .set_msr = kvmppc_set_msr_pr, + .vcpu_run = kvmppc_vcpu_run_pr, + .vcpu_create = kvmppc_core_vcpu_create_pr, + .vcpu_free = kvmppc_core_vcpu_free_pr, + .check_requests = kvmppc_core_check_requests_pr, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr, + .flush_memslot = kvmppc_core_flush_memslot_pr, + .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, + .commit_memory_region = kvmppc_core_commit_memory_region_pr, + .unmap_hva = kvm_unmap_hva_pr, + .unmap_hva_range = kvm_unmap_hva_range_pr, + .age_hva = kvm_age_hva_pr, + .test_age_hva = kvm_test_age_hva_pr, + .set_spte_hva = kvm_set_spte_hva_pr, + .mmu_destroy = kvmppc_mmu_destroy_pr, + .free_memslot = kvmppc_core_free_memslot_pr, + .create_memslot = kvmppc_core_create_memslot_pr, + .init_vm = kvmppc_core_init_vm_pr, + .destroy_vm = kvmppc_core_destroy_vm_pr, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, + .emulate_op = kvmppc_core_emulate_op_pr, + .emulate_mtspr = kvmppc_core_emulate_mtspr_pr, + .emulate_mfspr = kvmppc_core_emulate_mfspr_pr, + .fast_vcpu_kick = kvm_vcpu_kick, + .arch_vm_ioctl = kvm_arch_vm_ioctl_pr, +}; + + +int kvmppc_book3s_init_pr(void) +{ + int r; + + r = kvmppc_core_check_processor_compat_pr(); + if (r < 0) return r; - r = kvmppc_mmu_hpte_sysinit(); + kvm_ops_pr.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_pr; + r = kvmppc_mmu_hpte_sysinit(); return r; } -static void kvmppc_book3s_exit(void) +void kvmppc_book3s_exit_pr(void) { + kvmppc_pr_ops = NULL; kvmppc_mmu_hpte_sysexit(); - kvm_exit(); } -module_init(kvmppc_book3s_init); -module_exit(kvmppc_book3s_exit); +/* + * We only support separate modules for book3s 64 + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +module_init(kvmppc_book3s_init_pr); +module_exit(kvmppc_book3s_exit_pr); + +MODULE_LICENSE("GPL"); +#endif diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index da0e0bc..5efa97b 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -21,6 +21,8 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#define HPTE_SIZE 16 /* bytes per HPT entry */ + static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) { struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); @@ -40,32 +42,41 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) long pte_index = kvmppc_get_gpr(vcpu, 5); unsigned long pteg[2 * 8]; unsigned long pteg_addr, i, *hpte; + long int ret; + i = pte_index & 7; pte_index &= ~7UL; pteg_addr = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); hpte = pteg; + ret = H_PTEG_FULL; if (likely((flags & H_EXACT) == 0)) { - pte_index &= ~7UL; for (i = 0; ; ++i) { if (i == 8) - return H_PTEG_FULL; + goto done; if ((*hpte & HPTE_V_VALID) == 0) break; hpte += 2; } } else { - i = kvmppc_get_gpr(vcpu, 5) & 7UL; hpte += i * 2; + if (*hpte & HPTE_V_VALID) + goto done; } hpte[0] = kvmppc_get_gpr(vcpu, 6); hpte[1] = kvmppc_get_gpr(vcpu, 7); - copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + pteg_addr += i * HPTE_SIZE; + copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); kvmppc_set_gpr(vcpu, 4, pte_index | i); + ret = H_SUCCESS; + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } @@ -77,26 +88,31 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long v = 0, pteg, rb; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || - ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) + goto done; copy_to_user((void __user *)pteg, &v, sizeof(v)); rb = compute_tlbie_rb(pte[0], pte[1], pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + ret = H_SUCCESS; kvmppc_set_gpr(vcpu, 4, pte[0]); kvmppc_set_gpr(vcpu, 5, pte[1]); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + return EMULATE_DONE; } @@ -124,6 +140,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) int paramnr = 4; int ret = H_SUCCESS; + mutex_lock(&vcpu->kvm->arch.hpt_mutex); for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); @@ -172,6 +189,7 @@ static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) } kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; @@ -184,15 +202,16 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) unsigned long avpn = kvmppc_get_gpr(vcpu, 6); unsigned long rb, pteg, r, v; unsigned long pte[2]; + long int ret; pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + ret = H_NOT_FOUND; if ((pte[0] & HPTE_V_VALID) == 0 || - ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { - kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); - return EMULATE_DONE; - } + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) + goto done; v = pte[0]; r = pte[1]; @@ -207,8 +226,11 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) rb = compute_tlbie_rb(v, r, pte_index); vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); copy_to_user((void __user *)pteg, pte, sizeof(pte)); + ret = H_SUCCESS; - kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); return EMULATE_DONE; } diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 8f7633e..a38c4c9 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -38,32 +38,6 @@ #define FUNC(name) GLUE(.,name) - .globl kvmppc_skip_interrupt -kvmppc_skip_interrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_SRR0 - addi r13, r13, 4 - mtspr SPRN_SRR0, r13 - GET_SCRATCH0(r13) - rfid - b . - - .globl kvmppc_skip_Hinterrupt -kvmppc_skip_Hinterrupt: - /* - * Here all GPRs are unchanged from when the interrupt happened - * except for r13, which is saved in SPRG_SCRATCH0. - */ - mfspr r13, SPRN_HSRR0 - addi r13, r13, 4 - mtspr SPRN_HSRR0, r13 - GET_SCRATCH0(r13) - hrfid - b . - #elif defined(CONFIG_PPC_BOOK3S_32) #define FUNC(name) name @@ -179,11 +153,15 @@ _GLOBAL(kvmppc_entry_trampoline) li r6, MSR_IR | MSR_DR andc r6, r5, r6 /* Clear DR and IR in MSR value */ +#ifdef CONFIG_PPC_BOOK3S_32 /* * Set EE in HOST_MSR so that it's enabled when we get into our - * C exit handler function + * C exit handler function. On 64-bit we delay enabling + * interrupts until we have finished transferring stuff + * to or from the PACA. */ ori r5, r5, MSR_EE +#endif mtsrr0 r7 mtsrr1 r6 RFI diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 3219ba8..cf95cde 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -260,6 +260,7 @@ fail: */ return rc; } +EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall); void kvmppc_rtas_tokens_free(struct kvm *kvm) { diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 1abe478..bc50c97 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -161,8 +161,8 @@ kvmppc_handler_trampoline_enter_end: .global kvmppc_handler_trampoline_exit kvmppc_handler_trampoline_exit: -.global kvmppc_interrupt -kvmppc_interrupt: +.global kvmppc_interrupt_pr +kvmppc_interrupt_pr: /* Register usage at this point: * diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index a3a5cb8..02a17dc 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -818,7 +818,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) } /* Check for real mode returning too hard */ - if (xics->real_mode) + if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm)) return kvmppc_xics_rm_complete(vcpu, req); switch (req) { @@ -840,6 +840,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) return rc; } +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); /* -- Initialisation code etc. -- */ @@ -1250,13 +1251,13 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) xics_debugfs_init(xics); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206)) { /* Enable real mode support */ xics->real_mode = ENABLE_REALMODE; xics->real_mode_dbg = DEBUG_REALMODE; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 5133199..53e65a2 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -40,7 +40,9 @@ #include "timing.h" #include "booke.h" -#include "trace.h" + +#define CREATE_TRACE_POINTS +#include "trace_booke.h" unsigned long kvmppc_booke_handlers; @@ -133,6 +135,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + /* Synchronize guest's desire to get debug interrupts into shadow MSR */ +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr &= ~MSR_DE; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; +#endif + + /* Force enable debug interrupts when user space wants to debug */ + if (vcpu->guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV + /* + * Since there is no shadow MSR, sync MSR_DE into the guest + * visible MSR. + */ + vcpu->arch.shared->msr |= MSR_DE; +#else + vcpu->arch.shadow_msr |= MSR_DE; + vcpu->arch.shared->msr &= ~MSR_DE; +#endif + } +} + /* * Helper function for "full" MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -150,6 +175,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvmppc_mmu_msr_notify(vcpu, old_msr); kvmppc_vcpu_sync_spe(vcpu); kvmppc_vcpu_sync_fpu(vcpu); + kvmppc_vcpu_sync_debug(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, @@ -655,6 +681,7 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret, s; + struct thread_struct thread; #ifdef CONFIG_PPC_FPU struct thread_fp_state fp; int fpexc_mode; @@ -695,6 +722,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_load_guest_fp(vcpu); #endif + /* Switch to guest debug context */ + thread.debug = vcpu->arch.shadow_dbg_reg; + switch_booke_debug_regs(&thread); + thread.debug = current->thread.debug; + current->thread.debug = vcpu->arch.shadow_dbg_reg; + kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -702,6 +735,10 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* No need for kvm_guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ + /* Switch back to user space debug context */ + switch_booke_debug_regs(&thread); + current->thread.debug = thread.debug; + #ifdef CONFIG_PPC_FPU kvmppc_save_guest_fp(vcpu); @@ -757,6 +794,30 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) } } +static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + struct debug_reg *dbg_reg = &(vcpu->arch.shadow_dbg_reg); + u32 dbsr = vcpu->arch.dbsr; + + run->debug.arch.status = 0; + run->debug.arch.address = vcpu->arch.pc; + + if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) { + run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT; + } else { + if (dbsr & (DBSR_DAC1W | DBSR_DAC2W)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE; + else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ; + if (dbsr & (DBSR_DAC1R | DBSR_DAC1W)) + run->debug.arch.address = dbg_reg->dac1; + else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W)) + run->debug.arch.address = dbg_reg->dac2; + } + + return RESUME_HOST; +} + static void kvmppc_fill_pt_regs(struct pt_regs *regs) { ulong r1, ip, msr, lr; @@ -817,6 +878,11 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, case BOOKE_INTERRUPT_CRITICAL: unknown_exception(®s); break; + case BOOKE_INTERRUPT_DEBUG: + /* Save DBSR before preemption is enabled */ + vcpu->arch.dbsr = mfspr(SPRN_DBSR); + kvmppc_clear_dbsr(); + break; } } @@ -1134,18 +1200,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } case BOOKE_INTERRUPT_DEBUG: { - u32 dbsr; - - vcpu->arch.pc = mfspr(SPRN_CSRR0); - - /* clear IAC events in DBSR register */ - dbsr = mfspr(SPRN_DBSR); - dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4; - mtspr(SPRN_DBSR, dbsr); - - run->exit_reason = KVM_EXIT_DEBUG; + r = kvmppc_handle_debug(run, vcpu); + if (r == RESUME_HOST) + run->exit_reason = KVM_EXIT_DEBUG; kvmppc_account_exit(vcpu, DEBUG_EXITS); - r = RESUME_HOST; break; } @@ -1196,7 +1254,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, 0); #ifndef CONFIG_KVM_BOOKE_HV - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; vcpu->arch.shadow_pid = 1; vcpu->arch.shared->msr = 0; #endif @@ -1358,7 +1416,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { sregs->u.e.features |= KVM_SREGS_E_IVOR; @@ -1378,6 +1436,7 @@ void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + return 0; } int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -1412,8 +1471,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, get_sregs_base(vcpu, sregs); get_sregs_arch206(vcpu, sregs); - kvmppc_core_get_sregs(vcpu, sregs); - return 0; + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, @@ -1432,7 +1490,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (ret < 0) return ret; - return kvmppc_core_set_sregs(vcpu, sregs); + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); } int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) @@ -1440,7 +1498,6 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int r = 0; union kvmppc_one_reg val; int size; - long int i; size = one_reg_size(reg->id); if (size > sizeof(val)) @@ -1448,16 +1505,24 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) switch (reg->id) { case KVM_REG_PPC_IAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac1); + break; case KVM_REG_PPC_IAC2: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac2); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac3); + break; case KVM_REG_PPC_IAC4: - i = reg->id - KVM_REG_PPC_IAC1; - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]); + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac4); break; +#endif case KVM_REG_PPC_DAC1: + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac1); + break; case KVM_REG_PPC_DAC2: - i = reg->id - KVM_REG_PPC_DAC1; - val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]); + val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac2); break; case KVM_REG_PPC_EPR: { u32 epr = get_guest_epr(vcpu); @@ -1476,10 +1541,13 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) val = get_reg_val(reg->id, vcpu->arch.tsr); break; case KVM_REG_PPC_DEBUG_INST: - val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV); + val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV_DEBUG); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); break; default: - r = kvmppc_get_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, reg->id, &val); break; } @@ -1497,7 +1565,6 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) int r = 0; union kvmppc_one_reg val; int size; - long int i; size = one_reg_size(reg->id); if (size > sizeof(val)) @@ -1508,16 +1575,24 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) switch (reg->id) { case KVM_REG_PPC_IAC1: + vcpu->arch.dbg_reg.iac1 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_IAC2: + vcpu->arch.dbg_reg.iac2 = set_reg_val(reg->id, val); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 case KVM_REG_PPC_IAC3: + vcpu->arch.dbg_reg.iac3 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_IAC4: - i = reg->id - KVM_REG_PPC_IAC1; - vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.iac4 = set_reg_val(reg->id, val); break; +#endif case KVM_REG_PPC_DAC1: + vcpu->arch.dbg_reg.dac1 = set_reg_val(reg->id, val); + break; case KVM_REG_PPC_DAC2: - i = reg->id - KVM_REG_PPC_DAC1; - vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val); + vcpu->arch.dbg_reg.dac2 = set_reg_val(reg->id, val); break; case KVM_REG_PPC_EPR: { u32 new_epr = set_reg_val(reg->id, val); @@ -1551,20 +1626,17 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) kvmppc_set_tcr(vcpu, tcr); break; } + case KVM_REG_PPC_VRSAVE: + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; default: - r = kvmppc_set_one_reg(vcpu, reg->id, &val); + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, reg->id, &val); break; } return r; } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; @@ -1589,12 +1661,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { return 0; @@ -1670,6 +1742,157 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_set_tsr_bits(vcpu, TSR_DIS); } +static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg, + uint64_t addr, int index) +{ + switch (index) { + case 0: + dbg_reg->dbcr0 |= DBCR0_IAC1; + dbg_reg->iac1 = addr; + break; + case 1: + dbg_reg->dbcr0 |= DBCR0_IAC2; + dbg_reg->iac2 = addr; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 2: + dbg_reg->dbcr0 |= DBCR0_IAC3; + dbg_reg->iac3 = addr; + break; + case 3: + dbg_reg->dbcr0 |= DBCR0_IAC4; + dbg_reg->iac4 = addr; + break; +#endif + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} + +static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr, + int type, int index) +{ + switch (index) { + case 0: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC1R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC1W; + dbg_reg->dac1 = addr; + break; + case 1: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC2R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC2W; + dbg_reg->dac2 = addr; + break; + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} +void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set) +{ + /* XXX: Add similar MSR protection for BookE-PR */ +#ifdef CONFIG_KVM_BOOKE_HV + BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP)); + if (set) { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp |= MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp |= MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp |= MSRP_PMMP; + } else { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp &= ~MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp &= ~MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp &= ~MSRP_PMMP; + } +#endif +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + struct debug_reg *dbg_reg; + int n, b = 0, w = 0; + + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + vcpu->guest_debug = 0; + kvm_guest_protect_msr(vcpu, MSR_DE, false); + return 0; + } + + kvm_guest_protect_msr(vcpu, MSR_DE, true); + vcpu->guest_debug = dbg->control; + vcpu->arch.shadow_dbg_reg.dbcr0 = 0; + /* Set DBCR0_EDM in guest visible DBCR0 register. */ + vcpu->arch.dbg_reg.dbcr0 = DBCR0_EDM; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.shadow_dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + + /* Code below handles only HW breakpoints */ + dbg_reg = &(vcpu->arch.shadow_dbg_reg); + +#ifdef CONFIG_KVM_BOOKE_HV + /* + * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1 + * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0 + */ + dbg_reg->dbcr1 = 0; + dbg_reg->dbcr2 = 0; +#else + /* + * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1 + * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR + * is set. + */ + dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | + DBCR1_IAC4US; + dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#endif + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + return 0; + + for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { + uint64_t addr = dbg->arch.bp[n].addr; + uint32_t type = dbg->arch.bp[n].type; + + if (type == KVMPPC_DEBUG_NONE) + continue; + + if (type & !(KVMPPC_DEBUG_WATCH_READ | + KVMPPC_DEBUG_WATCH_WRITE | + KVMPPC_DEBUG_BREAKPOINT)) + return -EINVAL; + + if (type & KVMPPC_DEBUG_BREAKPOINT) { + /* Setting H/W breakpoint */ + if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) + return -EINVAL; + } else { + /* Setting H/W watchpoint */ + if (kvmppc_booke_add_watchpoint(dbg_reg, addr, + type, w++)) + return -EINVAL; + } + } + + return 0; +} + void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { vcpu->cpu = smp_processor_id(); @@ -1680,6 +1903,44 @@ void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) { current->thread.kvm_vcpu = NULL; vcpu->cpu = -1; + + /* Clear pending debug event in DBSR */ + kvmppc_clear_dbsr(); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return kvm->arch.kvm_ops->init_vm(kvm); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); } int __init kvmppc_booke_init(void) diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 5fd1ba6..09bfd9b 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -99,6 +99,30 @@ enum int_class { void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); +extern void kvmppc_mmu_destroy_44x(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_44x(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_44x(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); + /* * Load up guest vcpu FP state if it's needed. * It also set the MSR_FP in thread so that host know @@ -129,4 +153,9 @@ static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) giveup_fpu(current); #endif } + +static inline void kvmppc_clear_dbsr(void) +{ + mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); +} #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index ce6b73c..497b142 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -305,7 +305,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu) { kvmppc_booke_vcpu_load(vcpu, cpu); @@ -313,7 +313,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SPE if (vcpu->arch.shadow_msr & MSR_SPE) @@ -367,7 +367,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -388,9 +389,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvmppc_get_sregs_ivor(vcpu, sregs); kvmppc_get_sregs_e500_tlb(vcpu, sregs); + return 0; } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int ret; @@ -425,21 +428,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); return r; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -481,7 +485,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -492,15 +496,32 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_e500(struct kvm *kvm) { return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_e500(struct kvm *kvm) { } +static struct kvmppc_ops kvm_ops_e500 = { + .get_sregs = kvmppc_core_get_sregs_e500, + .set_sregs = kvmppc_core_set_sregs_e500, + .get_one_reg = kvmppc_get_one_reg_e500, + .set_one_reg = kvmppc_set_one_reg_e500, + .vcpu_load = kvmppc_core_vcpu_load_e500, + .vcpu_put = kvmppc_core_vcpu_put_e500, + .vcpu_create = kvmppc_core_vcpu_create_e500, + .vcpu_free = kvmppc_core_vcpu_free_e500, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500, + .destroy_vm = kvmppc_core_destroy_vm_e500, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500_init(void) { int r, i; @@ -512,11 +533,11 @@ static int __init kvmppc_e500_init(void) r = kvmppc_core_check_processor_compat(); if (r) - return r; + goto err_out; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; /* copy extra E500 exception handlers */ ivor[0] = mfspr(SPRN_IVOR32); @@ -534,11 +555,19 @@ static int __init kvmppc_e500_init(void) flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + ivor[max_ivor] + handler_len); - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500; + +err_out: + return r; } static void __exit kvmppc_e500_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index c2e5e98..4fd9650 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -117,7 +117,7 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) #define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) #define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) #define MAS2_ATTRIB_MASK \ - (MAS2_X0 | MAS2_X1) + (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G) #define MAS3_ATTRIB_MASK \ (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index b10a012..89b7f82 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -26,6 +26,7 @@ #define XOP_TLBRE 946 #define XOP_TLBWE 978 #define XOP_TLBILX 18 +#define XOP_EHPRIV 270 #ifdef CONFIG_KVM_E500MC static int dbell2prio(ulong param) @@ -82,8 +83,28 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) } #endif -int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int inst, int *advance) +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + run->debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + +int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) { int emulated = EMULATE_DONE; int ra = get_ra(inst); @@ -130,6 +151,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + advance); + break; + default: emulated = EMULATE_FAIL; } @@ -146,7 +172,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; @@ -237,7 +263,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 6d6f153..ebca6b8 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -32,7 +32,7 @@ #include <asm/kvm_ppc.h> #include "e500.h" -#include "trace.h" +#include "trace_booke.h" #include "timing.h" #include "e500_mmu_host.h" @@ -536,7 +536,7 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c65593a..ecf2247 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -32,10 +32,11 @@ #include <asm/kvm_ppc.h> #include "e500.h" -#include "trace.h" #include "timing.h" #include "e500_mmu_host.h" +#include "trace_booke.h" + #define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; @@ -253,6 +254,9 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, ref->pfn = pfn; ref->flags |= E500_TLB_VALID; + /* Mark the page accessed */ + kvm_set_pfn_accessed(pfn); + if (tlbe_is_writable(gtlbe)) kvm_set_pfn_dirty(pfn); } diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 19c8379..4132cd2 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -110,7 +110,7 @@ void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu_on_cpu); -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -147,7 +147,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvmppc_load_guest_fp(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) { vcpu->arch.eplc = mfspr(SPRN_EPLC); vcpu->arch.epsc = mfspr(SPRN_EPSC); @@ -204,7 +204,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -224,10 +225,11 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int ret; @@ -260,21 +262,22 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); return r; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); return r; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; struct kvm_vcpu *vcpu; @@ -315,7 +318,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -325,7 +328,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) { int lpid; @@ -337,27 +340,52 @@ int kvmppc_core_init_vm(struct kvm *kvm) return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) { kvmppc_free_lpid(kvm->arch.lpid); } +static struct kvmppc_ops kvm_ops_e500mc = { + .get_sregs = kvmppc_core_get_sregs_e500mc, + .set_sregs = kvmppc_core_set_sregs_e500mc, + .get_one_reg = kvmppc_get_one_reg_e500mc, + .set_one_reg = kvmppc_set_one_reg_e500mc, + .vcpu_load = kvmppc_core_vcpu_load_e500mc, + .vcpu_put = kvmppc_core_vcpu_put_e500mc, + .vcpu_create = kvmppc_core_vcpu_create_e500mc, + .vcpu_free = kvmppc_core_vcpu_free_e500mc, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500mc, + .destroy_vm = kvmppc_core_destroy_vm_e500mc, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + static int __init kvmppc_e500mc_init(void) { int r; r = kvmppc_booke_init(); if (r) - return r; + goto err_out; kvmppc_init_lpid(64); kvmppc_claim_lpid(0); /* host */ - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500mc.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500mc; + +err_out: + return r; } static void __exit kvmppc_e500mc_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 751cd45..2f9a087 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -130,8 +130,8 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_PIR: break; default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, - spr_val); + emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn, + spr_val); if (emulated == EMULATE_FAIL) printk(KERN_INFO "mtspr: unknown spr " "0x%x\n", sprn); @@ -191,8 +191,8 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) spr_val = kvmppc_get_dec(vcpu, get_tb()); break; default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, - &spr_val); + emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn, + &spr_val); if (unlikely(emulated == EMULATE_FAIL)) { printk(KERN_INFO "mfspr: unknown spr " "0x%x\n", sprn); @@ -464,7 +464,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) } if (emulated == EMULATE_FAIL) { - emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance); + emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst, + &advance); if (emulated == EMULATE_AGAIN) { advance = 0; } else if (emulated == EMULATE_FAIL) { @@ -483,3 +484,4 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) return emulated; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 07c0106..9ae9768 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/module.h> #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> @@ -39,6 +40,12 @@ #define CREATE_TRACE_POINTS #include "trace.h" +struct kvmppc_ops *kvmppc_hv_ops; +EXPORT_SYMBOL_GPL(kvmppc_hv_ops); +struct kvmppc_ops *kvmppc_pr_ops; +EXPORT_SYMBOL_GPL(kvmppc_pr_ops); + + int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !!(v->arch.pending_exceptions) || @@ -50,7 +57,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } -#ifndef CONFIG_KVM_BOOK3S_64_HV /* * Common checks before entering the guest world. Call with interrupts * disabled. @@ -125,7 +131,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) return r; } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { @@ -179,6 +185,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +EXPORT_SYMBOL_GPL(kvmppc_kvm_pv); int kvmppc_sanity_check(struct kvm_vcpu *vcpu) { @@ -192,11 +199,9 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) goto out; -#ifdef CONFIG_KVM_BOOK3S_64_HV /* HV KVM can only do PAPR mode for now */ - if (!vcpu->arch.papr_enabled) + if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm)) goto out; -#endif #ifdef CONFIG_KVM_BOOKE_HV if (!cpu_has_feature(CPU_FTR_EMB_HV)) @@ -209,6 +214,7 @@ out: vcpu->arch.sane = r; return r ? 0 : -EINVAL; } +EXPORT_SYMBOL_GPL(kvmppc_sanity_check); int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -243,6 +249,7 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) return r; } +EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio); int kvm_arch_hardware_enable(void *garbage) { @@ -269,10 +276,35 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (type) - return -EINVAL; - + struct kvmppc_ops *kvm_ops = NULL; + /* + * if we have both HV and PR enabled, default is HV + */ + if (type == 0) { + if (kvmppc_hv_ops) + kvm_ops = kvmppc_hv_ops; + else + kvm_ops = kvmppc_pr_ops; + if (!kvm_ops) + goto err_out; + } else if (type == KVM_VM_PPC_HV) { + if (!kvmppc_hv_ops) + goto err_out; + kvm_ops = kvmppc_hv_ops; + } else if (type == KVM_VM_PPC_PR) { + if (!kvmppc_pr_ops) + goto err_out; + kvm_ops = kvmppc_pr_ops; + } else + goto err_out; + + if (kvm_ops->owner && !try_module_get(kvm_ops->owner)) + return -ENOENT; + + kvm->arch.kvm_ops = kvm_ops; return kvmppc_core_init_vm(kvm); +err_out: + return -EINVAL; } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -292,6 +324,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvmppc_core_destroy_vm(kvm); mutex_unlock(&kvm->lock); + + /* drop the module reference */ + module_put(kvm->arch.kvm_ops->owner); } void kvm_arch_sync_events(struct kvm *kvm) @@ -301,6 +336,10 @@ void kvm_arch_sync_events(struct kvm *kvm) int kvm_dev_ioctl_check_extension(long ext) { int r; + /* FIXME!! + * Should some of this be vm ioctl ? is it possible now ? + */ + int hv_enabled = kvmppc_hv_ops ? 1 : 0; switch (ext) { #ifdef CONFIG_BOOKE @@ -320,22 +359,26 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_DEVICE_CTRL: r = 1; break; -#ifndef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: #endif -#ifdef CONFIG_KVM_MPIC - case KVM_CAP_IRQ_MPIC: -#endif - r = 1; + /* We support this only for PR */ + r = !hv_enabled; break; +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; #endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: + r = 1; + break; +#endif + #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_PPC_ALLOC_HTAB: @@ -346,32 +389,37 @@ int kvm_dev_ioctl_check_extension(long ext) r = 1; break; #endif /* CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: - r = threads_per_core; + if (hv_enabled) + r = threads_per_core; + else + r = 0; break; case KVM_CAP_PPC_RMA: - r = 1; + r = hv_enabled; /* PPC970 requires an RMA */ - if (cpu_has_feature(CPU_FTR_ARCH_201)) + if (r && cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; #endif case KVM_CAP_SYNC_MMU: -#ifdef CONFIG_KVM_BOOK3S_64_HV - r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (hv_enabled) + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + else + r = 0; #elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) r = 1; #else r = 0; - break; #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV + break; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: - r = 1; + r = hv_enabled; break; #endif - break; case KVM_CAP_NR_VCPUS: /* * Recommending a number of CPUs is somewhat arbitrary; we @@ -379,11 +427,10 @@ int kvm_dev_ioctl_check_extension(long ext) * will have secondary threads "offline"), and for other KVM * implementations just count online CPUs. */ -#ifdef CONFIG_KVM_BOOK3S_64_HV - r = num_present_cpus(); -#else - r = num_online_cpus(); -#endif + if (hv_enabled) + r = num_present_cpus(); + else + r = num_online_cpus(); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; @@ -407,15 +454,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - kvmppc_core_free_memslot(free, dont); + kvmppc_core_free_memslot(kvm, free, dont); } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { - return kvmppc_core_create_memslot(slot, npages); + return kvmppc_core_create_memslot(kvm, slot, npages); } void kvm_arch_memslots_updated(struct kvm *kvm) @@ -659,6 +707,7 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_load); /* Same as above, but sign extends */ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, @@ -720,6 +769,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } +EXPORT_SYMBOL_GPL(kvmppc_handle_store); int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -1024,52 +1074,12 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); goto out; } -#endif /* CONFIG_PPC_BOOK3S_64 */ - -#ifdef CONFIG_KVM_BOOK3S_64_HV - case KVM_ALLOCATE_RMA: { - struct kvm_allocate_rma rma; - struct kvm *kvm = filp->private_data; - - r = kvm_vm_ioctl_allocate_rma(kvm, &rma); - if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) - r = -EFAULT; - break; - } - - case KVM_PPC_ALLOCATE_HTAB: { - u32 htab_order; - - r = -EFAULT; - if (get_user(htab_order, (u32 __user *)argp)) - break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); - if (r) - break; - r = -EFAULT; - if (put_user(htab_order, (u32 __user *)argp)) - break; - r = 0; - break; - } - - case KVM_PPC_GET_HTAB_FD: { - struct kvm_get_htab_fd ghf; - - r = -EFAULT; - if (copy_from_user(&ghf, argp, sizeof(ghf))) - break; - r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); - break; - } -#endif /* CONFIG_KVM_BOOK3S_64_HV */ - -#ifdef CONFIG_PPC_BOOK3S_64 case KVM_PPC_GET_SMMU_INFO: { struct kvm_ppc_smmu_info info; + struct kvm *kvm = filp->private_data; memset(&info, 0, sizeof(info)); - r = kvm_vm_ioctl_get_smmu_info(kvm, &info); + r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info); if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) r = -EFAULT; break; @@ -1080,11 +1090,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } -#endif /* CONFIG_PPC_BOOK3S_64 */ + default: { + struct kvm *kvm = filp->private_data; + r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); + } +#else /* CONFIG_PPC_BOOK3S_64 */ default: r = -ENOTTY; +#endif } - out: return r; } @@ -1106,22 +1120,26 @@ long kvmppc_alloc_lpid(void) return lpid; } +EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); void kvmppc_claim_lpid(long lpid) { set_bit(lpid, lpid_inuse); } +EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); void kvmppc_free_lpid(long lpid) { clear_bit(lpid, lpid_inuse); } +EXPORT_SYMBOL_GPL(kvmppc_free_lpid); void kvmppc_init_lpid(unsigned long nr_lpids_param) { nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); memset(lpid_inuse, 0, sizeof(lpid_inuse)); } +EXPORT_SYMBOL_GPL(kvmppc_init_lpid); int kvm_arch_init(void *opaque) { @@ -1130,4 +1148,5 @@ int kvm_arch_init(void *opaque) void kvm_arch_exit(void) { + } diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index e326489..2e0e67e 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -31,126 +31,6 @@ TRACE_EVENT(kvm_ppc_instr, __entry->inst, __entry->pc, __entry->emulate) ); -#ifdef CONFIG_PPC_BOOK3S -#define kvm_trace_symbol_exit \ - {0x100, "SYSTEM_RESET"}, \ - {0x200, "MACHINE_CHECK"}, \ - {0x300, "DATA_STORAGE"}, \ - {0x380, "DATA_SEGMENT"}, \ - {0x400, "INST_STORAGE"}, \ - {0x480, "INST_SEGMENT"}, \ - {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ - {0x502, "EXTERNAL_HV"}, \ - {0x600, "ALIGNMENT"}, \ - {0x700, "PROGRAM"}, \ - {0x800, "FP_UNAVAIL"}, \ - {0x900, "DECREMENTER"}, \ - {0x980, "HV_DECREMENTER"}, \ - {0xc00, "SYSCALL"}, \ - {0xd00, "TRACE"}, \ - {0xe00, "H_DATA_STORAGE"}, \ - {0xe20, "H_INST_STORAGE"}, \ - {0xe40, "H_EMUL_ASSIST"}, \ - {0xf00, "PERFMON"}, \ - {0xf20, "ALTIVEC"}, \ - {0xf40, "VSX"} -#else -#define kvm_trace_symbol_exit \ - {0, "CRITICAL"}, \ - {1, "MACHINE_CHECK"}, \ - {2, "DATA_STORAGE"}, \ - {3, "INST_STORAGE"}, \ - {4, "EXTERNAL"}, \ - {5, "ALIGNMENT"}, \ - {6, "PROGRAM"}, \ - {7, "FP_UNAVAIL"}, \ - {8, "SYSCALL"}, \ - {9, "AP_UNAVAIL"}, \ - {10, "DECREMENTER"}, \ - {11, "FIT"}, \ - {12, "WATCHDOG"}, \ - {13, "DTLB_MISS"}, \ - {14, "ITLB_MISS"}, \ - {15, "DEBUG"}, \ - {32, "SPE_UNAVAIL"}, \ - {33, "SPE_FP_DATA"}, \ - {34, "SPE_FP_ROUND"}, \ - {35, "PERFORMANCE_MONITOR"}, \ - {36, "DOORBELL"}, \ - {37, "DOORBELL_CRITICAL"}, \ - {38, "GUEST_DBELL"}, \ - {39, "GUEST_DBELL_CRIT"}, \ - {40, "HV_SYSCALL"}, \ - {41, "HV_PRIV"} -#endif - -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), - TP_ARGS(exit_nr, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, exit_nr ) - __field( unsigned long, pc ) - __field( unsigned long, msr ) - __field( unsigned long, dar ) -#ifdef CONFIG_KVM_BOOK3S_PR - __field( unsigned long, srr1 ) -#endif - __field( unsigned long, last_inst ) - ), - - TP_fast_assign( -#ifdef CONFIG_KVM_BOOK3S_PR - struct kvmppc_book3s_shadow_vcpu *svcpu; -#endif - __entry->exit_nr = exit_nr; - __entry->pc = kvmppc_get_pc(vcpu); - __entry->dar = kvmppc_get_fault_dar(vcpu); - __entry->msr = vcpu->arch.shared->msr; -#ifdef CONFIG_KVM_BOOK3S_PR - svcpu = svcpu_get(vcpu); - __entry->srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); -#endif - __entry->last_inst = vcpu->arch.last_inst; - ), - - TP_printk("exit=%s" - " | pc=0x%lx" - " | msr=0x%lx" - " | dar=0x%lx" -#ifdef CONFIG_KVM_BOOK3S_PR - " | srr1=0x%lx" -#endif - " | last_inst=0x%lx" - , - __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), - __entry->pc, - __entry->msr, - __entry->dar, -#ifdef CONFIG_KVM_BOOK3S_PR - __entry->srr1, -#endif - __entry->last_inst - ) -); - -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), TP_ARGS(stlb_index), @@ -236,315 +116,6 @@ TRACE_EVENT(kvm_check_requests, __entry->cpu_nr, __entry->requests) ); - -/************************************************************************* - * Book3S trace points * - *************************************************************************/ - -#ifdef CONFIG_KVM_BOOK3S_PR - -TRACE_EVENT(kvm_book3s_reenter, - TP_PROTO(int r, struct kvm_vcpu *vcpu), - TP_ARGS(r, vcpu), - - TP_STRUCT__entry( - __field( unsigned int, r ) - __field( unsigned long, pc ) - ), - - TP_fast_assign( - __entry->r = r; - __entry->pc = kvmppc_get_pc(vcpu); - ), - - TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) -); - -#ifdef CONFIG_PPC_BOOK3S_64 - -TRACE_EVENT(kvm_book3s_64_mmu_map, - TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, - struct kvmppc_pte *orig_pte), - TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), - - TP_STRUCT__entry( - __field( unsigned char, flag_w ) - __field( unsigned char, flag_x ) - __field( unsigned long, eaddr ) - __field( unsigned long, hpteg ) - __field( unsigned long, va ) - __field( unsigned long long, vpage ) - __field( unsigned long, hpaddr ) - ), - - TP_fast_assign( - __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; - __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; - __entry->eaddr = orig_pte->eaddr; - __entry->hpteg = hpteg; - __entry->va = va; - __entry->vpage = orig_pte->vpage; - __entry->hpaddr = hpaddr; - ), - - TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", - __entry->flag_w, __entry->flag_x, __entry->eaddr, - __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) -); - -#endif /* CONFIG_PPC_BOOK3S_64 */ - -TRACE_EVENT(kvm_book3s_mmu_map, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_vpn ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_vpn = pte->host_vpn; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_vpn, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_invalidate, - TP_PROTO(struct hpte_cache *pte), - TP_ARGS(pte), - - TP_STRUCT__entry( - __field( u64, host_vpn ) - __field( u64, pfn ) - __field( ulong, eaddr ) - __field( u64, vpage ) - __field( ulong, raddr ) - __field( int, flags ) - ), - - TP_fast_assign( - __entry->host_vpn = pte->host_vpn; - __entry->pfn = pte->pfn; - __entry->eaddr = pte->pte.eaddr; - __entry->vpage = pte->pte.vpage; - __entry->raddr = pte->pte.raddr; - __entry->flags = (pte->pte.may_read ? 0x4 : 0) | - (pte->pte.may_write ? 0x2 : 0) | - (pte->pte.may_execute ? 0x1 : 0); - ), - - TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", - __entry->host_vpn, __entry->pfn, __entry->eaddr, - __entry->vpage, __entry->raddr, __entry->flags) -); - -TRACE_EVENT(kvm_book3s_mmu_flush, - TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, - unsigned long long p2), - TP_ARGS(type, vcpu, p1, p2), - - TP_STRUCT__entry( - __field( int, count ) - __field( unsigned long long, p1 ) - __field( unsigned long long, p2 ) - __field( const char *, type ) - ), - - TP_fast_assign( - __entry->count = to_book3s(vcpu)->hpte_cache_count; - __entry->p1 = p1; - __entry->p2 = p2; - __entry->type = type; - ), - - TP_printk("Flush %d %sPTEs: %llx - %llx", - __entry->count, __entry->type, __entry->p1, __entry->p2) -); - -TRACE_EVENT(kvm_book3s_slb_found, - TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), - TP_ARGS(gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned long long, gvsid ) - __field( unsigned long long, hvsid ) - ), - - TP_fast_assign( - __entry->gvsid = gvsid; - __entry->hvsid = hvsid; - ), - - TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) -); - -TRACE_EVENT(kvm_book3s_slb_fail, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), - TP_ARGS(sid_map_mask, gvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, gvsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->gvsid = gvsid; - ), - - TP_printk("%x/%x: %llx", __entry->sid_map_mask, - SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) -); - -TRACE_EVENT(kvm_book3s_slb_map, - TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, - unsigned long long hvsid), - TP_ARGS(sid_map_mask, gvsid, hvsid), - - TP_STRUCT__entry( - __field( unsigned short, sid_map_mask ) - __field( unsigned long long, guest_vsid ) - __field( unsigned long long, host_vsid ) - ), - - TP_fast_assign( - __entry->sid_map_mask = sid_map_mask; - __entry->guest_vsid = gvsid; - __entry->host_vsid = hvsid; - ), - - TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, - __entry->guest_vsid, __entry->host_vsid) -); - -TRACE_EVENT(kvm_book3s_slbmte, - TP_PROTO(u64 slb_vsid, u64 slb_esid), - TP_ARGS(slb_vsid, slb_esid), - - TP_STRUCT__entry( - __field( u64, slb_vsid ) - __field( u64, slb_esid ) - ), - - TP_fast_assign( - __entry->slb_vsid = slb_vsid; - __entry->slb_esid = slb_esid; - ), - - TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) -); - -#endif /* CONFIG_PPC_BOOK3S */ - - -/************************************************************************* - * Book3E trace points * - *************************************************************************/ - -#ifdef CONFIG_BOOKE - -TRACE_EVENT(kvm_booke206_stlb_write, - TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), - TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), - - TP_STRUCT__entry( - __field( __u32, mas0 ) - __field( __u32, mas8 ) - __field( __u32, mas1 ) - __field( __u64, mas2 ) - __field( __u64, mas7_3 ) - ), - - TP_fast_assign( - __entry->mas0 = mas0; - __entry->mas8 = mas8; - __entry->mas1 = mas1; - __entry->mas2 = mas2; - __entry->mas7_3 = mas7_3; - ), - - TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", - __entry->mas0, __entry->mas8, __entry->mas1, - __entry->mas2, __entry->mas7_3) -); - -TRACE_EVENT(kvm_booke206_gtlb_write, - TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), - TP_ARGS(mas0, mas1, mas2, mas7_3), - - TP_STRUCT__entry( - __field( __u32, mas0 ) - __field( __u32, mas1 ) - __field( __u64, mas2 ) - __field( __u64, mas7_3 ) - ), - - TP_fast_assign( - __entry->mas0 = mas0; - __entry->mas1 = mas1; - __entry->mas2 = mas2; - __entry->mas7_3 = mas7_3; - ), - - TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", - __entry->mas0, __entry->mas1, - __entry->mas2, __entry->mas7_3) -); - -TRACE_EVENT(kvm_booke206_ref_release, - TP_PROTO(__u64 pfn, __u32 flags), - TP_ARGS(pfn, flags), - - TP_STRUCT__entry( - __field( __u64, pfn ) - __field( __u32, flags ) - ), - - TP_fast_assign( - __entry->pfn = pfn; - __entry->flags = flags; - ), - - TP_printk("pfn=%llx flags=%x", - __entry->pfn, __entry->flags) -); - -TRACE_EVENT(kvm_booke_queue_irqprio, - TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), - TP_ARGS(vcpu, priority), - - TP_STRUCT__entry( - __field( __u32, cpu_nr ) - __field( __u32, priority ) - __field( unsigned long, pending ) - ), - - TP_fast_assign( - __entry->cpu_nr = vcpu->vcpu_id; - __entry->priority = priority; - __entry->pending = vcpu->arch.pending_exceptions; - ), - - TP_printk("vcpu=%x prio=%x pending=%lx", - __entry->cpu_nr, __entry->priority, __entry->pending) -); - -#endif - #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h new file mode 100644 index 0000000..f7537cf --- /dev/null +++ b/arch/powerpc/kvm/trace_booke.h @@ -0,0 +1,177 @@ +#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_BOOKE_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_booke +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_booke + +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%x pending=%lx", + __entry->cpu_nr, __entry->priority, __entry->pending) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h new file mode 100644 index 0000000..8b22e47 --- /dev/null +++ b/arch/powerpc/kvm/trace_pr.h @@ -0,0 +1,297 @@ + +#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_PR_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_pr +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_pr + +#define kvm_trace_symbol_exit \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} + +TRACE_EVENT(kvm_book3s_reenter, + TP_PROTO(int r, struct kvm_vcpu *vcpu), + TP_ARGS(r, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, r ) + __field( unsigned long, pc ) + ), + + TP_fast_assign( + __entry->r = r; + __entry->pc = kvmppc_get_pc(vcpu); + ), + + TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, + TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, + struct kvmppc_pte *orig_pte), + TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + + TP_STRUCT__entry( + __field( unsigned char, flag_w ) + __field( unsigned char, flag_x ) + __field( unsigned long, eaddr ) + __field( unsigned long, hpteg ) + __field( unsigned long, va ) + __field( unsigned long long, vpage ) + __field( unsigned long, hpaddr ) + ), + + TP_fast_assign( + __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; + __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; + __entry->eaddr = orig_pte->eaddr; + __entry->hpteg = hpteg; + __entry->va = va; + __entry->vpage = orig_pte->vpage; + __entry->hpaddr = hpaddr; + ), + + TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", + __entry->flag_w, __entry->flag_x, __entry->eaddr, + __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, + TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, + unsigned long long p2), + TP_ARGS(type, vcpu, p1, p2), + + TP_STRUCT__entry( + __field( int, count ) + __field( unsigned long long, p1 ) + __field( unsigned long long, p2 ) + __field( const char *, type ) + ), + + TP_fast_assign( + __entry->count = to_book3s(vcpu)->hpte_cache_count; + __entry->p1 = p1; + __entry->p2 = p2; + __entry->type = type; + ), + + TP_printk("Flush %d %sPTEs: %llx - %llx", + __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, + TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), + TP_ARGS(gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned long long, gvsid ) + __field( unsigned long long, hvsid ) + ), + + TP_fast_assign( + __entry->gvsid = gvsid; + __entry->hvsid = hvsid; + ), + + TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), + TP_ARGS(sid_map_mask, gvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, gvsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->gvsid = gvsid; + ), + + TP_printk("%x/%x: %llx", __entry->sid_map_mask, + SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, + unsigned long long hvsid), + TP_ARGS(sid_map_mask, gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, guest_vsid ) + __field( unsigned long long, host_vsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->guest_vsid = gvsid; + __entry->host_vsid = hvsid; + ), + + TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, + __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, + TP_PROTO(u64 slb_vsid, u64 slb_esid), + TP_ARGS(slb_vsid, slb_esid), + + TP_STRUCT__entry( + __field( u64, slb_vsid ) + __field( u64, slb_esid ) + ), + + TP_fast_assign( + __entry->slb_vsid = slb_vsid; + __entry->slb_esid = slb_esid; + ), + + TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, srr1 ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->srr1 = vcpu->arch.shadow_srr1; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | srr1=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->srr1, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e87ecaa..d5bc375 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -38,13 +38,6 @@ struct sca_block { struct sca_entry cpu[64]; } __attribute__((packed)); -#define KVM_NR_PAGE_SIZES 2 -#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8) -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) -#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) -#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) - #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -220,7 +213,6 @@ struct kvm_s390_interrupt_info { /* for local_interrupt.action_flags */ #define ACTION_STORE_ON_STOP (1<<0) #define ACTION_STOP_ON_STOP (1<<1) -#define ACTION_RELOADVCPU_ON_STOP (1<<2) struct kvm_s390_local_interrupt { spinlock_t lock; diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3a74d8a..78d967f 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -107,14 +107,13 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { - int ret, idx; + int ret; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) return -EOPNOTSUPP; - idx = srcu_read_lock(&vcpu->kvm->srcu); /* * The layout is as follows: * - gpr 2 contains the subchannel id (passed as addr) @@ -125,7 +124,6 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) vcpu->run->s.regs.gprs[2], 8, &vcpu->run->s.regs.gprs[3], vcpu->run->s.regs.gprs[4]); - srcu_read_unlock(&vcpu->kvm->srcu, idx); /* * Return cookie in gpr 2, but don't overwrite the register if the diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 99d789e..374a439 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,20 +18,27 @@ #include <asm/uaccess.h> #include "kvm-s390.h" +/* Convert real to absolute address by applying the prefix of the CPU */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gaddr) +{ + unsigned long prefix = vcpu->arch.sie_block->prefix; + if (gaddr < 2 * PAGE_SIZE) + gaddr += prefix; + else if (gaddr >= prefix && gaddr < prefix + 2 * PAGE_SIZE) + gaddr -= prefix; + return gaddr; +} + static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu, void __user *gptr, int prefixing) { - unsigned long prefix = vcpu->arch.sie_block->prefix; unsigned long gaddr = (unsigned long) gptr; unsigned long uaddr; - if (prefixing) { - if (gaddr < 2 * PAGE_SIZE) - gaddr += prefix; - else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE)) - gaddr -= prefix; - } + if (prefixing) + gaddr = kvm_s390_real_to_abs(vcpu, gaddr); uaddr = gmap_fault(gaddr, vcpu->arch.gmap); if (IS_ERR_VALUE(uaddr)) uaddr = -EFAULT; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 5ee56e5..5ddbbde 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -62,12 +62,6 @@ static int handle_stop(struct kvm_vcpu *vcpu) trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits); - if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) { - vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP; - rc = SIE_INTERCEPT_RERUNVCPU; - vcpu->run->exit_reason = KVM_EXIT_INTR; - } - if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 7f1f7ac..5f79d2d 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -436,6 +436,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); no_timer: + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); spin_lock(&vcpu->arch.local_int.float_int->lock); spin_lock_bh(&vcpu->arch.local_int.lock); add_wait_queue(&vcpu->wq, &wait); @@ -455,6 +456,8 @@ no_timer: remove_wait_queue(&vcpu->wq, &wait); spin_unlock_bh(&vcpu->arch.local_int.lock); spin_unlock(&vcpu->arch.local_int.float_int->lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ed8064c..569494e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -695,9 +695,9 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; } -static int __vcpu_run(struct kvm_vcpu *vcpu) +static int vcpu_pre_run(struct kvm_vcpu *vcpu) { - int rc; + int rc, cpuflags; memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16); @@ -715,28 +715,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; vcpu->arch.sie_block->icptcode = 0; - VCPU_EVENT(vcpu, 6, "entering sie flags %x", - atomic_read(&vcpu->arch.sie_block->cpuflags)); - trace_kvm_s390_sie_enter(vcpu, - atomic_read(&vcpu->arch.sie_block->cpuflags)); + cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); + VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); + trace_kvm_s390_sie_enter(vcpu, cpuflags); - /* - * As PF_VCPU will be used in fault handler, between guest_enter - * and guest_exit should be no uaccess. - */ - preempt_disable(); - kvm_guest_enter(); - preempt_enable(); - rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); - kvm_guest_exit(); + return 0; +} + +static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) +{ + int rc; VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", vcpu->arch.sie_block->icptcode); trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); - if (rc > 0) + if (exit_reason >= 0) { rc = 0; - if (rc < 0) { + } else { if (kvm_is_ucontrol(vcpu->kvm)) { rc = SIE_INTERCEPT_UCONTROL; } else { @@ -747,6 +743,49 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16); + + if (rc == 0) { + if (kvm_is_ucontrol(vcpu->kvm)) + rc = -EOPNOTSUPP; + else + rc = kvm_handle_sie_intercept(vcpu); + } + + return rc; +} + +static int __vcpu_run(struct kvm_vcpu *vcpu) +{ + int rc, exit_reason; + + /* + * We try to hold kvm->srcu during most of vcpu_run (except when run- + * ning the guest), so that memslots (and other stuff) are protected + */ + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + do { + rc = vcpu_pre_run(vcpu); + if (rc) + break; + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* + * As PF_VCPU will be used in fault handler, between + * guest_enter and guest_exit should be no uaccess. + */ + preempt_disable(); + kvm_guest_enter(); + preempt_enable(); + exit_reason = sie64a(vcpu->arch.sie_block, + vcpu->run->s.regs.gprs); + kvm_guest_exit(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + rc = vcpu_post_run(vcpu, exit_reason); + } while (!signal_pending(current) && !rc); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); return rc; } @@ -755,7 +794,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int rc; sigset_t sigsaved; -rerun_vcpu: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -788,19 +826,7 @@ rerun_vcpu: } might_fault(); - - do { - rc = __vcpu_run(vcpu); - if (rc) - break; - if (kvm_is_ucontrol(vcpu->kvm)) - rc = -EOPNOTSUPP; - else - rc = kvm_handle_sie_intercept(vcpu); - } while (!signal_pending(current) && !rc); - - if (rc == SIE_INTERCEPT_RERUNVCPU) - goto rerun_vcpu; + rc = __vcpu_run(vcpu); if (signal_pending(current) && !rc) { kvm_run->exit_reason = KVM_EXIT_INTR; @@ -958,6 +984,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + int idx; long r; switch (ioctl) { @@ -971,7 +998,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_S390_STORE_STATUS: + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_s390_vcpu_store_status(vcpu, arg); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case KVM_S390_SET_INITIAL_PSW: { psw_t psw; @@ -1067,12 +1096,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index dc99f1c..b44912a 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -28,8 +28,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); extern unsigned long *vfacilities; /* negativ values are error codes, positive values for internal conditions */ -#define SIE_INTERCEPT_RERUNVCPU (1<<0) -#define SIE_INTERCEPT_UCONTROL (1<<1) +#define SIE_INTERCEPT_UCONTROL (1<<0) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ @@ -91,8 +90,10 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2) { - *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20; - *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; + if (r1) + *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20; + if (r2) + *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; } static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 59200ee..2440602 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -30,6 +30,38 @@ #include "kvm-s390.h" #include "trace.h" +/* Handle SCK (SET CLOCK) interception */ +static int handle_set_clock(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *cpup; + s64 hostclk, val; + u64 op2; + int i; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + op2 = kvm_s390_get_base_disp_s(vcpu); + if (op2 & 7) /* Operand must be on a doubleword boundary */ + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + if (get_guest(vcpu, val, (u64 __user *) op2)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (store_tod_clock(&hostclk)) { + kvm_s390_set_psw_cc(vcpu, 3); + return 0; + } + val = (val - hostclk) & ~0x3fUL; + + mutex_lock(&vcpu->kvm->lock); + kvm_for_each_vcpu(i, cpup, vcpu->kvm) + cpup->arch.sie_block->epoch = val; + mutex_unlock(&vcpu->kvm->lock); + + kvm_s390_set_psw_cc(vcpu, 0); + return 0; +} + static int handle_set_prefix(struct kvm_vcpu *vcpu) { u64 operand2; @@ -128,6 +160,33 @@ static int handle_skey(struct kvm_vcpu *vcpu) return 0; } +static int handle_test_block(struct kvm_vcpu *vcpu) +{ + unsigned long hva; + gpa_t addr; + int reg2; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + kvm_s390_get_regs_rre(vcpu, NULL, ®2); + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_real_to_abs(vcpu, addr); + + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(hva)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + /* + * We don't expect errors on modern systems, and do not care + * about storage keys (yet), so let's just clear the page. + */ + if (clear_user((void __user *)hva, PAGE_SIZE) != 0) + return -EFAULT; + kvm_s390_set_psw_cc(vcpu, 0); + vcpu->run->s.regs.gprs[0] = 0; + return 0; +} + static int handle_tpi(struct kvm_vcpu *vcpu) { struct kvm_s390_interrupt_info *inti; @@ -438,12 +497,14 @@ out_exception: static const intercept_handler_t b2_handlers[256] = { [0x02] = handle_stidp, + [0x04] = handle_set_clock, [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, [0x29] = handle_skey, [0x2a] = handle_skey, [0x2b] = handle_skey, + [0x2c] = handle_test_block, [0x30] = handle_io_inst, [0x31] = handle_io_inst, [0x32] = handle_io_inst, diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 15f960c..24ec121 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -274,13 +274,17 @@ struct x86_emulate_ctxt { bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ - bool only_vendor_specific_insn; + bool ud; /* inject an #UD if host doesn't support insn */ bool have_exception; struct x86_exception exception; - /* decode cache */ - u8 twobyte; + /* + * decode cache + */ + + /* current opcode length in bytes */ + u8 opcode_len; u8 b; u8 intercept; u8 lock_prefix; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c76ff74..ae5d783 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -79,6 +79,13 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + #define SELECTOR_TI_MASK (1 << 2) #define SELECTOR_RPL_MASK 0x03 @@ -253,7 +260,6 @@ struct kvm_pio_request { * mode. */ struct kvm_mmu { - void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); @@ -261,7 +267,6 @@ struct kvm_mmu { bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); - void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, struct x86_exception *exception); gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); @@ -389,6 +394,8 @@ struct kvm_vcpu_arch { struct fpu guest_fpu; u64 xcr0; + u64 guest_supported_xcr0; + u32 guest_xstate_size; struct kvm_pio_request pio; void *pio_data; @@ -557,7 +564,9 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; - int iommu_flags; + bool iommu_noncoherent; +#define __KVM_HAVE_ARCH_NONCOHERENT_DMA + atomic_t noncoherent_dma_count; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; @@ -780,11 +789,11 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_setup(struct kvm_vcpu *vcpu); +void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -922,13 +931,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); void kvm_enable_tdp(void); void kvm_disable_tdp(void); -int complete_pio(struct kvm_vcpu *vcpu); -bool kvm_check_iopl(struct kvm_vcpu *vcpu); - static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) { return gpa; diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index be8269b..d6b078e 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -14,6 +14,8 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct timespec *ts); void pvclock_resume(void); +void pvclock_touch_watchdogs(void); + /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 5d9a303..d3a8778 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 { __u32 padding[3]; }; -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 +#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) +#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) +#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) /* for KVM_SET_CPUID2 */ struct kvm_cpuid2 { diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index bb04650..b93e09a 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -536,6 +536,7 @@ /* MSR_IA32_VMX_MISC bits */ #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1570e07..e604109 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void) src = &hv_clock[cpu].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); ret = true; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index a16bae3..2f355d2 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) return pv_tsc_khz; } +void pvclock_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + reset_hung_task_detector(); +} + static atomic64_t last_value = ATOMIC64_INIT(0); void pvclock_resume(void) @@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) version = __pvclock_read_cycles(src, &ret, &flags); } while ((src->version & 1) || version != src->version); + if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { + src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + } + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && (flags & PVCLOCK_TSC_STABLE_BIT)) return ret; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a47a3e5..b89c5db 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -38,6 +38,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_VFIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index bf4fb04..25d22b2 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,7 +9,7 @@ KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b110fe6..c697625 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -23,6 +23,26 @@ #include "mmu.h" #include "trace.h" +static u32 xstate_required_size(u64 xstate_bv) +{ + int feature_bit = 0; + u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + xstate_bv &= ~XSTATE_FPSSE; + while (xstate_bv) { + if (xstate_bv & 0x1) { + u32 eax, ebx, ecx, edx; + cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); + ret = max(ret, eax + ebx); + } + + xstate_bv >>= 1; + feature_bit++; + } + + return ret; +} + void kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -46,6 +66,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) { + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + } else { + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & + host_xcr0 & KVM_SUPPORTED_XCR0; + vcpu->arch.guest_xstate_size = + xstate_required_size(vcpu->arch.guest_supported_xcr0); + } + kvm_pmu_cpuid_update(vcpu); } @@ -182,13 +214,35 @@ static bool supported_xcr0_bit(unsigned bit) { u64 mask = ((u64)1 << bit); - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; + return mask & KVM_SUPPORTED_XCR0 & host_xcr0; } #define F(x) bit(X86_FEATURE_##x) -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) +static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, + u32 func, u32 index, int *nent, int maxnent) +{ + switch (func) { + case 0: + entry->eax = 1; /* only one leaf currently */ + ++*nent; + break; + case 1: + entry->ecx = F(MOVBE); + ++*nent; + break; + default: + break; + } + + entry->function = func; + entry->index = index; + + return 0; +} + +static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) { int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; @@ -383,6 +437,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 0xd: { int idx, i; + entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0; + entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (idx = 1, i = 1; idx < 64; ++idx) { if (*nent >= maxnent) @@ -481,6 +537,15 @@ out: return r; } +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func, + u32 idx, int *nent, int maxnent, unsigned int type) +{ + if (type == KVM_GET_EMULATED_CPUID) + return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent); + + return __do_cpuid_ent(entry, func, idx, nent, maxnent); +} + #undef F struct kvm_cpuid_param { @@ -495,8 +560,36 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param) return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; } -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, + __u32 num_entries, unsigned int ioctl_type) +{ + int i; + __u32 pad[3]; + + if (ioctl_type != KVM_GET_EMULATED_CPUID) + return false; + + /* + * We want to make sure that ->padding is being passed clean from + * userspace in case we want to use it for something in the future. + * + * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we + * have to give ourselves satisfied only with the emulated side. /me + * sheds a tear. + */ + for (i = 0; i < num_entries; i++) { + if (copy_from_user(pad, entries[i].padding, sizeof(pad))) + return true; + + if (pad[0] || pad[1] || pad[2]) + return true; + } + return false; +} + +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type) { struct kvm_cpuid_entry2 *cpuid_entries; int limit, nent = 0, r = -E2BIG, i; @@ -513,8 +606,12 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, goto out; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; + + if (sanity_check_entries(entries, cpuid->nent, type)) + return -EINVAL; + r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); if (!cpuid_entries) goto out; @@ -526,7 +623,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, continue; r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -537,7 +634,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, limit = cpuid_entries[nent - 1].eax; for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent); + &nent, cpuid->nent, type); if (r) goto out_free; @@ -661,6 +758,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) *edx = best->edx; } else *eax = *ebx = *ecx = *edx = 0; + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); } EXPORT_SYMBOL_GPL(kvm_cpuid); @@ -676,6 +774,5 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, eax, ebx, ecx, edx); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b7fd079..f1e4895 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,8 +6,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); +int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries, + unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ddc3f3d..07ffca0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -130,7 +130,7 @@ #define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ -#define VendorSpecific (1<<22) /* Vendor specific instruction */ +#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -785,9 +785,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, - int highbyte_regs) + int byteop) { void *p; + int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; @@ -1024,7 +1025,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { unsigned reg = ctxt->modrm_reg; - int highbyte_regs = ctxt->rex_prefix == 0; if (!(ctxt->d & ModRM)) reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); @@ -1045,13 +1045,9 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if (ctxt->d & ByteOp) { - op->addr.reg = decode_register(ctxt, reg, highbyte_regs); - op->bytes = 1; - } else { - op->addr.reg = decode_register(ctxt, reg, 0); - op->bytes = ctxt->op_bytes; - } + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); + fetch_register_operand(op); op->orig_val = op->val; } @@ -1082,12 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3) { - int highbyte_regs = ctxt->rex_prefix == 0; - op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, - highbyte_regs && (ctxt->d & ByteOp)); + ctxt->d & ByteOp); if (ctxt->d & Sse) { op->type = OP_XMM; op->bytes = 16; @@ -2961,6 +2955,46 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +#define FFL(x) bit(X86_FEATURE_##x) + +static int em_movbe(struct x86_emulate_ctxt *ctxt) +{ + u32 ebx, ecx, edx, eax = 1; + u16 tmp; + + /* + * Check MOVBE is set in the guest-visible CPUID leaf. + */ + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(ecx & FFL(MOVBE))) + return emulate_ud(ctxt); + + switch (ctxt->op_bytes) { + case 2: + /* + * From MOVBE definition: "...When the operand size is 16 bits, + * the upper word of the destination register remains unchanged + * ..." + * + * Both casting ->valptr and ->val to u16 breaks strict aliasing + * rules so we have to do the operation almost per hand. + */ + tmp = (u16)ctxt->src.val; + ctxt->dst.val &= ~0xffffUL; + ctxt->dst.val |= (unsigned long)swab16(tmp); + break; + case 4: + ctxt->dst.val = swab32((u32)ctxt->src.val); + break; + case 8: + ctxt->dst.val = swab64(ctxt->src.val); + break; + default: + return X86EMUL_PROPAGATE_FAULT; + } + return X86EMUL_CONTINUE; +} + static int em_cr_write(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) @@ -3256,6 +3290,18 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_sahf(struct x86_emulate_ctxt *ctxt) +{ + u32 flags; + + flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF; + flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8; + + ctxt->eflags &= ~0xffUL; + ctxt->eflags |= flags | X86_EFLAGS_FIXED; + return X86EMUL_CONTINUE; +} + static int em_lahf(struct x86_emulate_ctxt *ctxt) { *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; @@ -3502,7 +3548,7 @@ static const struct opcode group7_rm1[] = { static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), + II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall), DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), DIP(SrcNone | Prot | Priv, stgi, check_svme), @@ -3587,7 +3633,7 @@ static const struct group_dual group7 = { { II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | EmulateOnUD, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3750,7 +3796,8 @@ static const struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), + II(ImplicitOps | Stack, em_popf, popf), + I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), @@ -3810,7 +3857,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | VendorSpecific, em_syscall), + N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, @@ -3830,8 +3877,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | VendorSpecific, em_sysenter), - I(ImplicitOps | Priv | VendorSpecific, em_sysexit), + I(ImplicitOps | EmulateOnUD, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -3892,6 +3939,30 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; +static const struct gprefix three_byte_0f_38_f0 = { + I(DstReg | SrcMem | Mov, em_movbe), N, N, N +}; + +static const struct gprefix three_byte_0f_38_f1 = { + I(DstMem | SrcReg | Mov, em_movbe), N, N, N +}; + +/* + * Insns below are selected by the prefix which indexed by the third opcode + * byte. + */ +static const struct opcode opcode_map_0f_38[256] = { + /* 0x00 - 0x7f */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0x80 - 0xef */ + X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0xf0 - 0xf1 */ + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0), + GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1), + /* 0xf2 - 0xff */ + N, N, X4(N), X8(N) +}; + #undef D #undef N #undef G @@ -4040,7 +4111,8 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpMem8: ctxt->memop.bytes = 1; if (ctxt->memop.type == OP_REG) { - ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1); + ctxt->memop.addr.reg = decode_register(ctxt, + ctxt->modrm_rm, true); fetch_register_operand(&ctxt->memop); } goto mem_common; @@ -4126,6 +4198,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); @@ -4208,9 +4281,16 @@ done_prefixes: opcode = opcode_table[ctxt->b]; /* Two-byte opcode? */ if (ctxt->b == 0x0f) { - ctxt->twobyte = 1; + ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; + + /* 0F_38 opcode map */ + if (ctxt->b == 0x38) { + ctxt->opcode_len = 3; + ctxt->b = insn_fetch(u8, ctxt); + opcode = opcode_map_0f_38[ctxt->b]; + } } ctxt->d = opcode.flags; @@ -4267,7 +4347,7 @@ done_prefixes: if (ctxt->d == 0 || (ctxt->d & NotImpl)) return EMULATION_FAILED; - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) @@ -4540,8 +4620,10 @@ special_insn: goto writeback; } - if (ctxt->twobyte) + if (ctxt->opcode_len == 2) goto twobyte_insn; + else if (ctxt->opcode_len == 3) + goto threebyte_insn; switch (ctxt->b) { case 0x63: /* movsxd */ @@ -4726,6 +4808,8 @@ twobyte_insn: goto cannot_emulate; } +threebyte_insn: + if (rc != X86EMUL_CONTINUE) goto done; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dce0df8..40772ef 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2570,11 +2570,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, kvm_release_pfn_clean(pfn); } -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -3424,18 +3419,11 @@ out_unlock: return 0; } -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3444,7 +3432,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_hpa = INVALID_PAGE; context->direct_map = true; context->nx = false; - return 0; } void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) @@ -3454,9 +3441,8 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); -static void paging_new_cr3(struct kvm_vcpu *vcpu) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); mmu_free_roots(vcpu); } @@ -3471,11 +3457,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { @@ -3665,9 +3646,9 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->last_pte_bitmap = map; } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) +static void paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { context->nx = is_nx(vcpu); context->root_level = level; @@ -3677,27 +3658,24 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; - context->free = paging_free; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { context->nx = false; context->root_level = PT32_ROOT_LEVEL; @@ -3706,33 +3684,28 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; - return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); + paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.walk_mmu; context->base_role.word = 0; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; - context->free = nonpaging_free; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -3767,37 +3740,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - - return 0; } -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - int r; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu, context); + nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu, context); + paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu, context); + paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu, context); + paging32_init_context(vcpu, context); vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); - - return r; } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { ASSERT(vcpu); @@ -3806,37 +3774,30 @@ int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; - context->new_cr3 = paging_new_cr3; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; - context->free = paging_free; context->root_level = context->shadow_root_level; context->root_hpa = INVALID_PAGE; context->direct_map = false; update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); - - return 0; } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - + kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - return r; } -static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; @@ -3873,11 +3834,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, g_context, false); update_last_pte_bitmap(vcpu, g_context); - - return 0; } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); @@ -3887,18 +3846,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return init_kvm_softmmu(vcpu); } -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - /* mmu.free() should set root_hpa = INVALID_PAGE */ - vcpu->arch.mmu.free(vcpu); -} -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); + init_kvm_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -3923,6 +3876,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); + WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4281,12 +4235,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return alloc_mmu_pages(vcpu); } -int kvm_mmu_setup(struct kvm_vcpu *vcpu) +void kvm_mmu_setup(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - return init_kvm_mmu(vcpu); + init_kvm_mmu(vcpu); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) @@ -4428,7 +4382,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int nr_to_scan = sc->nr_to_scan; unsigned long freed = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx; @@ -4478,9 +4432,8 @@ unlock: break; } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return freed; - } static unsigned long @@ -4574,7 +4527,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - destroy_kvm_mmu(vcpu); + kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 77e044a..2926152 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -70,8 +70,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c0bc803..c7168a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1959,11 +1959,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, nested_svm_vmexit(svm); } -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - int r; - - r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; @@ -1971,8 +1969,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2b2fce1..b2fe1c2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1498,7 +1498,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, break; if (i == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING"Not enough mst switch entries. " + printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } else if (i == m->nr) { @@ -1898,16 +1898,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running). */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) + if (!(vmcs12->exception_bitmap & (1u << nr))) return 0; nested_vmx_vmexit(vcpu); @@ -1921,8 +1917,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (nr == PF_VECTOR && is_guest_mode(vcpu) && - !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) + if (!reinject && is_guest_mode(vcpu) && + nested_vmx_check_exception(vcpu, nr)) return; if (has_error_code) { @@ -2204,9 +2200,15 @@ static __init void nested_vmx_setup_ctls_msrs(void) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || + !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { + nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + } nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER); + VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2226,7 +2228,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); nested_vmx_procbased_ctls_low = 0; nested_vmx_procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | @@ -2252,13 +2255,15 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_WBINVD_EXITING; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_INVEPT_BIT; nested_vmx_ept_caps &= vmx_capability.ept; /* * Since invept is completely emulated we support both global @@ -3380,8 +3385,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : - vcpu->kvm->arch.ept_identity_map_addr; + if (is_paging(vcpu) || is_guest_mode(vcpu)) + guest_cr3 = kvm_read_cr3(vcpu); + else + guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -4879,6 +4886,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +{ + unsigned long always_on = VMXON_CR0_ALWAYSON; + + if (nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + always_on &= ~(X86_CR0_PE | X86_CR0_PG); + return (val & always_on) == always_on; +} + /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -4897,9 +4915,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - /* TODO: will have to take unrestricted guest mode into - * account */ - if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) + if (!nested_cr0_valid(vmcs12, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -6627,6 +6643,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; else if (is_page_fault(intr_info)) return enable_ept; + else if (is_no_device(intr_info) && + !(nested_read_cr0(vmcs12) & X86_CR0_TS)) + return 0; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -6722,6 +6741,27 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) +{ + u64 delta_tsc_l1; + u32 preempt_val_l1, preempt_val_l2, preempt_scale; + + if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & + PIN_BASED_VMX_PREEMPTION_TIMER)) + return; + preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & + MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; + preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) + - vcpu->arch.last_guest_tsc; + preempt_val_l1 = delta_tsc_l1 >> preempt_scale; + if (preempt_val_l2 <= preempt_val_l1) + preempt_val_l2 = 0; + else + preempt_val_l2 -= preempt_val_l1; + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -6736,20 +6776,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME)) - vmx->nested.nested_run_pending = 1; - else - vmx->nested.nested_run_pending = 0; - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { nested_vmx_vmexit(vcpu); return 1; @@ -7061,9 +7087,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(vcpu, vector, err); + kvm_requeue_exception_e(vcpu, vector, err); } else - kvm_queue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); @@ -7146,6 +7172,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); + if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) + nested_adjust_preemption_timer(vcpu); vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -7284,6 +7312,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); + /* + * the KVM_REQ_EVENT optimization bit is only on for one entry, and if + * we did not inject a still-pending event to L1 now because of + * nested_run_pending, we need to re-enable this bit. + */ + if (vmx->nested.nested_run_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vmx->nested.nested_run_pending = 0; + vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); @@ -7410,8 +7448,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) */ if (is_mmio) ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) + else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) ret = kvm_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; else @@ -7501,9 +7538,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, + kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; @@ -7511,8 +7548,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -7520,6 +7555,20 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + WARN_ON(!is_guest_mode(vcpu)); + + /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ + if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + nested_vmx_vmexit(vcpu); + else + kvm_inject_page_fault(vcpu, fault); +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -7533,6 +7582,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + u32 exit_control; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -7706,7 +7756,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits are further modified by vmx_set_efer() below. */ - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + exit_control = vmcs_config.vmexit_ctrl; + if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) + exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; + vmcs_write32(VM_EXIT_CONTROLS, exit_control); /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are * emulated by vmx_set_efer(), below. @@ -7773,6 +7826,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_set_cr3(vcpu, vmcs12->guest_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + /* * L1 may access the L2's PDPTR, so save them to construct vmcs12 */ @@ -7876,7 +7932,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || + if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -7938,6 +7994,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enter_guest_mode(vcpu); + vmx->nested.nested_run_pending = 1; + vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); cpu = get_cpu(); @@ -8005,7 +8063,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -8023,7 +8081,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } vmcs12->idt_vectoring_info_field = idt_vectoring; - } else if (vcpu->arch.nmi_pending) { + } else if (vcpu->arch.nmi_injected) { vmcs12->idt_vectoring_info_field = INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; } else if (vcpu->arch.interrupt.pending) { @@ -8105,6 +8163,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && + (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) + vmcs12->vmx_preemption_timer_value = + vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); + /* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. @@ -8130,6 +8193,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) + vmcs12->guest_ia32_efer = vcpu->arch.efer; vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); @@ -8201,7 +8266,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * fpu_active (which may have changed). * Note that vmx_set_cr0 refers to efer set above. */ - kvm_set_cr0(vcpu, vmcs12->host_cr0); + vmx_set_cr0(vcpu, vmcs12->host_cr0); /* * If we did fpu_activate()/fpu_deactivate() during L2's run, we need * to apply the same changes to L1's vmcs. We just set cr0 correctly, @@ -8224,6 +8289,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_cr3(vcpu, vmcs12->host_cr3); kvm_mmu_reset_context(vcpu); + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + if (enable_vpid) { /* * Trivially support vpid by letting L2s share their parent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ca72a..21ef1ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -577,6 +577,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0; + u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) @@ -586,8 +587,16 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) return 1; if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) return 1; - if (xcr0 & ~host_xcr0) + + /* + * Do not allow the guest to set bits that we do not support + * saving. However, xcr0 bit 0 is always set, even if the + * emulated CPU does not support XSAVE (see fx_init). + */ + valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + if (xcr0 & ~valid_bits) return 1; + kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; return 0; @@ -684,7 +693,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - vcpu->arch.mmu.new_cr3(vcpu); + kvm_mmu_new_cr3(vcpu); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -2564,6 +2573,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: @@ -2673,15 +2683,17 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_GET_SUPPORTED_CPUID: { + case KVM_GET_SUPPORTED_CPUID: + case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); + + r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, + ioctl); if (r) goto out; @@ -2715,8 +2727,7 @@ static void wbinvd_ipi(void *garbage) static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); + return kvm_arch_has_noncoherent_dma(vcpu->kvm); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -2984,11 +2995,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) + if (cpu_has_xsave) { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->xsave, - xstate_size); - else { + vcpu->arch.guest_xstate_size); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= + vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; + } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu.state->fxsave, sizeof(struct i387_fxsave_struct)); @@ -3003,10 +3016,19 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) + if (cpu_has_xsave) { + /* + * Here we allow setting states that are not present in + * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility + * with old userspace. + */ + if (xstate_bv & ~KVM_SUPPORTED_XCR0) + return -EINVAL; + if (xstate_bv & ~host_xcr0) + return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, xstate_size); - else { + guest_xsave->region, vcpu->arch.guest_xstate_size); + } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; memcpy(&vcpu->arch.guest_fpu.state->fxsave, @@ -3042,9 +3064,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, for (i = 0; i < guest_xcrs->nr_xcrs; i++) /* Only support XCR0 currently */ - if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, - guest_xcrs->xcrs[0].value); + guest_xcrs->xcrs[i].value); break; } if (r) @@ -4775,8 +4797,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static void init_decode_cache(struct x86_emulate_ctxt *ctxt) { - memset(&ctxt->twobyte, 0, - (void *)&ctxt->_regs - (void *)&ctxt->twobyte); + memset(&ctxt->opcode_len, 0, + (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); ctxt->fetch.start = 0; ctxt->fetch.end = 0; @@ -5094,8 +5116,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, ctxt->have_exception = false; ctxt->perm_ok = false; - ctxt->only_vendor_specific_insn - = emulation_type & EMULTYPE_TRAP_UD; + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; r = x86_decode_insn(ctxt, insn, insn_len); @@ -5263,7 +5284,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -5273,7 +5294,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -5426,12 +5447,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); atomic_set(&kvm_guest_has_master_clock, 0); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -5945,10 +5966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = IN_GUEST_MODE; + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* We should set ->mode before check ->requests, * see the comment in make_all_cpus_request. */ - smp_mb(); + smp_mb__after_srcu_read_unlock(); local_irq_disable(); @@ -5958,12 +5981,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = 1; goto cancel_injection; } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); @@ -6688,7 +6710,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (r) return r; kvm_vcpu_reset(vcpu); - r = kvm_mmu_setup(vcpu); + kvm_mmu_setup(vcpu); vcpu_put(vcpu); return r; @@ -6940,6 +6962,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; + + vcpu->arch.guest_supported_xcr0 = 0; + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); @@ -6981,6 +7007,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); + atomic_set(&kvm->arch.noncoherent_dma_count, 0); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); @@ -7065,7 +7092,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { int i; @@ -7086,7 +7113,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, } } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { int i; @@ -7283,7 +7311,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) int r; if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - is_error_page(work->page)) + work->wakeup_all) return; r = kvm_mmu_reload(vcpu); @@ -7393,7 +7421,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct x86_exception fault; trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (is_error_page(work->page)) + if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); @@ -7420,6 +7448,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) kvm_x86_ops->interrupt_allowed(vcpu); } +void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ + atomic_inc(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); + +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ + atomic_dec(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); + +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.noncoherent_dma_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e224f7a..587fb9e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -122,6 +122,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) extern u64 host_xcr0; extern struct static_key kvm_no_apic_vcpu; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0fbbc7a..9523d2a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -142,7 +142,7 @@ struct kvm; struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -extern raw_spinlock_t kvm_lock; +extern spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_io_range { @@ -189,8 +189,7 @@ struct kvm_async_pf { gva_t gva; unsigned long addr; struct kvm_arch_async_pf arch; - struct page *page; - bool done; + bool wakeup_all; }; void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); @@ -508,9 +507,10 @@ int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont); -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages); void kvm_arch_memslots_updated(struct kvm *kvm); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -671,6 +671,25 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif +#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA +void kvm_arch_register_noncoherent_dma(struct kvm *kvm); +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm); +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm); +#else +static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm) +{ +} + +static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) +{ +} + +static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) +{ + return false; +} +#endif + static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP @@ -747,9 +766,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -/* For vcpu->arch.iommu_flags */ -#define KVM_IOMMU_CACHE_COHERENCY 0x1 - #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); @@ -789,7 +805,7 @@ static inline void kvm_guest_enter(void) /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspase from rcu point of view. In + * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. @@ -842,13 +858,6 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn) return gfn_to_memslot(kvm, gfn)->id; } -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - static inline gfn_t hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) { @@ -1066,6 +1075,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; +extern struct kvm_device_ops kvm_vfio_ops; #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/include/linux/sched.h b/include/linux/sched.h index f7efc86..6f7ffa4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -286,6 +286,14 @@ static inline void lockup_detector_init(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +void reset_hung_task_detector(void); +#else +static inline void reset_hung_task_detector(void) +{ +} +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index c114614..9b058ee 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -237,4 +237,18 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) __srcu_read_unlock(sp, idx); } +/** + * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock + * + * Converts the preceding srcu_read_unlock into a two-way memory barrier. + * + * Call this after srcu_read_unlock, to guarantee that all memory operations + * that occur after smp_mb__after_srcu_read_unlock will appear to happen after + * the preceding srcu_read_unlock. + */ +static inline void smp_mb__after_srcu_read_unlock(void) +{ + /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ +} + #endif diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 7005d11..131a0bd 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -296,23 +296,21 @@ DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready, TRACE_EVENT( kvm_async_pf_completed, - TP_PROTO(unsigned long address, struct page *page, u64 gva), - TP_ARGS(address, page, gva), + TP_PROTO(unsigned long address, u64 gva), + TP_ARGS(address, gva), TP_STRUCT__entry( __field(unsigned long, address) - __field(pfn_t, pfn) __field(u64, gva) ), TP_fast_assign( __entry->address = address; - __entry->pfn = page ? page_to_pfn(page) : 0; __entry->gva = gva; ), - TP_printk("gva %#llx address %#lx pfn %#llx", __entry->gva, - __entry->address, __entry->pfn) + TP_printk("gva %#llx address %#lx", __entry->gva, + __entry->address) ); #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 99c2533..902f124 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -518,6 +518,10 @@ struct kvm_ppc_smmu_info { /* machine type bits, to be used as argument to KVM_CREATE_VM */ #define KVM_VM_S390_UCONTROL 1 +/* on ppc, 0 indicate default, 1 should force HV and 2 PR */ +#define KVM_VM_PPC_HV 1 +#define KVM_VM_PPC_PR 2 + #define KVM_S390_SIE_PAGE_OFFSET 1 /* @@ -541,6 +545,7 @@ struct kvm_ppc_smmu_info { #define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 #define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 #define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 +#define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) /* * Extension capability list. @@ -668,6 +673,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IRQ_XICS 92 #define KVM_CAP_ARM_EL1_32BIT 93 #define KVM_CAP_SPAPR_MULTITCE 94 +#define KVM_CAP_EXT_EMUL_CPUID 95 #ifdef KVM_CAP_IRQ_ROUTING @@ -843,6 +849,10 @@ struct kvm_device_attr { #define KVM_DEV_TYPE_FSL_MPIC_20 1 #define KVM_DEV_TYPE_FSL_MPIC_42 2 #define KVM_DEV_TYPE_XICS 3 +#define KVM_DEV_TYPE_VFIO 4 +#define KVM_DEV_VFIO_GROUP 1 +#define KVM_DEV_VFIO_GROUP_ADD 1 +#define KVM_DEV_VFIO_GROUP_DEL 2 /* * ioctls for VM fds @@ -1012,6 +1022,7 @@ struct kvm_s390_ucas_mapping { /* VM is being stopped by host */ #define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad) #define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init) +#define KVM_ARM_PREFERRED_TARGET _IOR(KVMIO, 0xaf, struct kvm_vcpu_init) #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8807061..9328b80 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -207,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +static atomic_t reset_hung_task = ATOMIC_INIT(0); + +void reset_hung_task_detector(void) +{ + atomic_set(&reset_hung_task, 1); +} +EXPORT_SYMBOL_GPL(reset_hung_task_detector); + /* * kthread which checks for tasks stuck in D state */ @@ -220,6 +228,9 @@ static int watchdog(void *dummy) while (schedule_timeout_interruptible(timeout_jiffies(timeout))) timeout = sysctl_hung_task_timeout_secs; + if (atomic_xchg(&reset_hung_task, 0)) + continue; + check_hung_uninterruptible_tasks(timeout); } diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 779262f..fbe1a48 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -27,3 +27,6 @@ config HAVE_KVM_MSI config HAVE_KVM_CPU_RELAX_INTERCEPT bool + +config KVM_VFIO + bool diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 8a39dda..8631d9c 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -56,7 +56,6 @@ void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu) static void async_pf_execute(struct work_struct *work) { - struct page *page = NULL; struct kvm_async_pf *apf = container_of(work, struct kvm_async_pf, work); struct mm_struct *mm = apf->mm; @@ -68,14 +67,12 @@ static void async_pf_execute(struct work_struct *work) use_mm(mm); down_read(&mm->mmap_sem); - get_user_pages(current, mm, addr, 1, 1, 0, &page, NULL); + get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL); up_read(&mm->mmap_sem); unuse_mm(mm); spin_lock(&vcpu->async_pf.lock); list_add_tail(&apf->link, &vcpu->async_pf.done); - apf->page = page; - apf->done = true; spin_unlock(&vcpu->async_pf.lock); /* @@ -83,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) * this point */ - trace_kvm_async_pf_completed(addr, page, gva); + trace_kvm_async_pf_completed(addr, gva); if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); @@ -99,9 +96,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) struct kvm_async_pf *work = list_entry(vcpu->async_pf.queue.next, typeof(*work), queue); - cancel_work_sync(&work->work); list_del(&work->queue); - if (!work->done) { /* work was canceled */ + if (cancel_work_sync(&work->work)) { mmdrop(work->mm); kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ kmem_cache_free(async_pf_cache, work); @@ -114,8 +110,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) list_entry(vcpu->async_pf.done.next, typeof(*work), link); list_del(&work->link); - if (!is_error_page(work->page)) - kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } spin_unlock(&vcpu->async_pf.lock); @@ -135,14 +129,11 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) list_del(&work->link); spin_unlock(&vcpu->async_pf.lock); - if (work->page) - kvm_arch_async_page_ready(vcpu, work); + kvm_arch_async_page_ready(vcpu, work); kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; - if (!is_error_page(work->page)) - kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } } @@ -165,8 +156,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, if (!work) return 0; - work->page = NULL; - work->done = false; + work->wakeup_all = false; work->vcpu = vcpu; work->gva = gva; work->addr = gfn_to_hva(vcpu->kvm, gfn); @@ -206,7 +196,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) if (!work) return -ENOMEM; - work->page = KVM_ERR_PTR_BAD_PAGE; + work->wakeup_all = true; INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 72a130b..0df7d4b 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -79,7 +79,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) flags = IOMMU_READ; if (!(slot->flags & KVM_MEM_READONLY)) flags |= IOMMU_WRITE; - if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY) + if (!kvm->arch.iommu_noncoherent) flags |= IOMMU_CACHE; @@ -103,6 +103,10 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) while ((gfn << PAGE_SHIFT) & (page_size - 1)) page_size >>= 1; + /* Make sure hva is aligned to the page size we want to map */ + while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) + page_size >>= 1; + /* * Pin all pages we are about to map in memory. This is * important because we unmap and unpin in 4kb steps later. @@ -140,6 +144,9 @@ static int kvm_iommu_map_memslots(struct kvm *kvm) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + if (kvm->arch.iommu_noncoherent) + kvm_arch_register_noncoherent_dma(kvm); + idx = srcu_read_lock(&kvm->srcu); slots = kvm_memslots(kvm); @@ -158,7 +165,8 @@ int kvm_assign_device(struct kvm *kvm, { struct pci_dev *pdev = NULL; struct iommu_domain *domain = kvm->arch.iommu_domain; - int r, last_flags; + int r; + bool noncoherent; /* check if iommu exists and in use */ if (!domain) @@ -174,15 +182,13 @@ int kvm_assign_device(struct kvm *kvm, return r; } - last_flags = kvm->arch.iommu_flags; - if (iommu_domain_has_cap(kvm->arch.iommu_domain, - IOMMU_CAP_CACHE_COHERENCY)) - kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY; + noncoherent = !iommu_domain_has_cap(kvm->arch.iommu_domain, + IOMMU_CAP_CACHE_COHERENCY); /* Check if need to update IOMMU page table for guest memory */ - if ((last_flags ^ kvm->arch.iommu_flags) == - KVM_IOMMU_CACHE_COHERENCY) { + if (noncoherent != kvm->arch.iommu_noncoherent) { kvm_iommu_unmap_memslots(kvm); + kvm->arch.iommu_noncoherent = noncoherent; r = kvm_iommu_map_memslots(kvm); if (r) goto out_unmap; @@ -190,11 +196,7 @@ int kvm_assign_device(struct kvm *kvm, pdev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; - printk(KERN_DEBUG "assign device %x:%x:%x.%x\n", - assigned_dev->host_segnr, - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); + dev_info(&pdev->dev, "kvm assign device\n"); return 0; out_unmap: @@ -220,11 +222,7 @@ int kvm_deassign_device(struct kvm *kvm, pdev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; - printk(KERN_DEBUG "deassign device %x:%x:%x.%x\n", - assigned_dev->host_segnr, - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); + dev_info(&pdev->dev, "kvm deassign device\n"); return 0; } @@ -336,6 +334,9 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); + if (kvm->arch.iommu_noncoherent) + kvm_arch_unregister_noncoherent_dma(kvm); + return 0; } @@ -350,6 +351,7 @@ int kvm_iommu_unmap_guest(struct kvm *kvm) mutex_lock(&kvm->slots_lock); kvm_iommu_unmap_memslots(kvm); kvm->arch.iommu_domain = NULL; + kvm->arch.iommu_noncoherent = false; mutex_unlock(&kvm->slots_lock); iommu_domain_free(domain); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cf9ccb..662f34c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -70,7 +70,8 @@ MODULE_LICENSE("GPL"); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_RAW_SPINLOCK(kvm_lock); +DEFINE_SPINLOCK(kvm_lock); +static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -186,6 +187,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } +EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); void kvm_reload_remote_mmus(struct kvm *kvm) { @@ -490,9 +492,9 @@ static struct kvm *kvm_create_vm(unsigned long type) if (r) goto out_err; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return kvm; @@ -540,13 +542,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) /* * Free any memory in @free but not in @dont. */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, +static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); - kvm_arch_free_memslot(free, dont); + kvm_arch_free_memslot(kvm, free, dont); free->npages = 0; } @@ -557,7 +559,7 @@ void kvm_free_physmem(struct kvm *kvm) struct kvm_memory_slot *memslot; kvm_for_each_memslot(memslot, slots) - kvm_free_physmem_slot(memslot, NULL); + kvm_free_physmem_slot(kvm, memslot, NULL); kfree(kvm->memslots); } @@ -581,9 +583,9 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_del(&kvm->vm_list); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) kvm_io_bus_destroy(kvm->buses[i]); @@ -821,7 +823,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (change == KVM_MR_CREATE) { new.userspace_addr = mem->userspace_addr; - if (kvm_arch_create_memslot(&new, npages)) + if (kvm_arch_create_memslot(kvm, &new, npages)) goto out_free; } @@ -872,6 +874,19 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } + /* actual memory is freed via old in kvm_free_physmem_slot below */ + if (change == KVM_MR_DELETE) { + new.dirty_bitmap = NULL; + memset(&new.arch, 0, sizeof(new.arch)); + } + + old_memslots = install_new_memslots(kvm, slots, &new); + + kvm_arch_commit_memory_region(kvm, mem, &old, change); + + kvm_free_physmem_slot(kvm, &old, &new); + kfree(old_memslots); + /* * IOMMU mapping: New slots need to be mapped. Old slots need to be * un-mapped and re-mapped if their base changes. Since base change @@ -883,29 +898,15 @@ int __kvm_set_memory_region(struct kvm *kvm, */ if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { r = kvm_iommu_map_pages(kvm, &new); - if (r) - goto out_slots; - } - - /* actual memory is freed via old in kvm_free_physmem_slot below */ - if (change == KVM_MR_DELETE) { - new.dirty_bitmap = NULL; - memset(&new.arch, 0, sizeof(new.arch)); + return r; } - old_memslots = install_new_memslots(kvm, slots, &new); - - kvm_arch_commit_memory_region(kvm, mem, &old, change); - - kvm_free_physmem_slot(&old, &new); - kfree(old_memslots); - return 0; out_slots: kfree(slots); out_free: - kvm_free_physmem_slot(&new, &old); + kvm_free_physmem_slot(kvm, &new, &old); out: return r; } @@ -964,6 +965,7 @@ int kvm_get_dirty_log(struct kvm *kvm, out: return r; } +EXPORT_SYMBOL_GPL(kvm_get_dirty_log); bool kvm_largepages_enabled(void) { @@ -1654,6 +1656,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } +EXPORT_SYMBOL_GPL(mark_page_dirty); /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. @@ -1679,6 +1682,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); } +EXPORT_SYMBOL_GPL(kvm_vcpu_block); #ifndef CONFIG_S390 /* @@ -2271,6 +2275,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, ops = &kvm_xics_ops; break; #endif +#ifdef CONFIG_KVM_VFIO + case KVM_DEV_TYPE_VFIO: + ops = &kvm_vfio_ops; + break; +#endif default: return -ENODEV; } @@ -2519,44 +2528,12 @@ out: } #endif -static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page[1]; - unsigned long addr; - int npages; - gfn_t gfn = vmf->pgoff; - struct kvm *kvm = vma->vm_file->private_data; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return VM_FAULT_SIGBUS; - - npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, - NULL); - if (unlikely(npages != 1)) - return VM_FAULT_SIGBUS; - - vmf->page = page[0]; - return 0; -} - -static const struct vm_operations_struct kvm_vm_vm_ops = { - .fault = kvm_vm_fault, -}; - -static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_vm_vm_ops; - return 0; -} - static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = kvm_vm_compat_ioctl, #endif - .mmap = kvm_vm_mmap, .llseek = noop_llseek, }; @@ -2683,11 +2660,12 @@ static void hardware_enable_nolock(void *junk) } } -static void hardware_enable(void *junk) +static void hardware_enable(void) { - raw_spin_lock(&kvm_lock); - hardware_enable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_enable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_nolock(void *junk) @@ -2700,11 +2678,12 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(NULL); } -static void hardware_disable(void *junk) +static void hardware_disable(void) { - raw_spin_lock(&kvm_lock); - hardware_disable_nolock(junk); - raw_spin_unlock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); + if (kvm_usage_count) + hardware_disable_nolock(NULL); + raw_spin_unlock(&kvm_count_lock); } static void hardware_disable_all_nolock(void) @@ -2718,16 +2697,16 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); } static int hardware_enable_all(void) { int r = 0; - raw_spin_lock(&kvm_lock); + raw_spin_lock(&kvm_count_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -2740,7 +2719,7 @@ static int hardware_enable_all(void) } } - raw_spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_count_lock); return r; } @@ -2750,20 +2729,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, { int cpu = (long)v; - if (!kvm_usage_count) - return NOTIFY_OK; - val &= ~CPU_TASKS_FROZEN; switch (val) { case CPU_DYING: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - hardware_disable(NULL); + hardware_disable(); break; case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - hardware_enable(NULL); + hardware_enable(); break; } return NOTIFY_OK; @@ -3056,10 +3032,10 @@ static int vm_stat_get(void *_offset, u64 *val) struct kvm *kvm; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) *val += *(u32 *)((void *)kvm + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3073,12 +3049,12 @@ static int vcpu_stat_get(void *_offset, u64 *val) int i; *val = 0; - raw_spin_lock(&kvm_lock); + spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u32 *)((void *)vcpu + offset); - raw_spin_unlock(&kvm_lock); + spin_unlock(&kvm_lock); return 0; } @@ -3133,7 +3109,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_lock)); + WARN_ON(raw_spin_is_locked(&kvm_count_lock)); hardware_enable_nolock(NULL); } } diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c new file mode 100644 index 0000000..ca4260e --- /dev/null +++ b/virt/kvm/vfio.c @@ -0,0 +1,264 @@ +/* + * VFIO-KVM bridge pseudo device + * + * Copyright (C) 2013 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/errno.h> +#include <linux/file.h> +#include <linux/kvm_host.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> + +struct kvm_vfio_group { + struct list_head node; + struct vfio_group *vfio_group; +}; + +struct kvm_vfio { + struct list_head group_list; + struct mutex lock; + bool noncoherent; +}; + +static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) +{ + struct vfio_group *vfio_group; + struct vfio_group *(*fn)(struct file *); + + fn = symbol_get(vfio_group_get_external_user); + if (!fn) + return ERR_PTR(-EINVAL); + + vfio_group = fn(filep); + + symbol_put(vfio_group_get_external_user); + + return vfio_group; +} + +static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) +{ + void (*fn)(struct vfio_group *); + + fn = symbol_get(vfio_group_put_external_user); + if (!fn) + return; + + fn(vfio_group); + + symbol_put(vfio_group_put_external_user); +} + +/* + * Groups can use the same or different IOMMU domains. If the same then + * adding a new group may change the coherency of groups we've previously + * been told about. We don't want to care about any of that so we retest + * each group and bail as soon as we find one that's noncoherent. This + * means we only ever [un]register_noncoherent_dma once for the whole device. + */ +static void kvm_vfio_update_coherency(struct kvm_device *dev) +{ + struct kvm_vfio *kv = dev->private; + bool noncoherent = false; + struct kvm_vfio_group *kvg; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + /* + * TODO: We need an interface to check the coherency of + * the IOMMU domain this group is using. For now, assume + * it's always noncoherent. + */ + noncoherent = true; + break; + } + + if (noncoherent != kv->noncoherent) { + kv->noncoherent = noncoherent; + + if (kv->noncoherent) + kvm_arch_register_noncoherent_dma(dev->kvm); + else + kvm_arch_unregister_noncoherent_dma(dev->kvm); + } + + mutex_unlock(&kv->lock); +} + +static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) +{ + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + void __user *argp = (void __user *)arg; + struct fd f; + int32_t fd; + int ret; + + switch (attr) { + case KVM_DEV_VFIO_GROUP_ADD: + if (get_user(fd, (int32_t __user *)argp)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group == vfio_group) { + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return -EEXIST; + } + } + + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); + if (!kvg) { + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return -ENOMEM; + } + + list_add_tail(&kvg->node, &kv->group_list); + kvg->vfio_group = vfio_group; + + mutex_unlock(&kv->lock); + + kvm_vfio_update_coherency(dev); + + return 0; + + case KVM_DEV_VFIO_GROUP_DEL: + if (get_user(fd, (int32_t __user *)argp)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + list_del(&kvg->node); + kvm_vfio_group_put_external_user(kvg->vfio_group); + kfree(kvg); + ret = 0; + break; + } + + mutex_unlock(&kv->lock); + + kvm_vfio_group_put_external_user(vfio_group); + + kvm_vfio_update_coherency(dev); + + return ret; + } + + return -ENXIO; +} + +static int kvm_vfio_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_VFIO_GROUP: + return kvm_vfio_set_group(dev, attr->attr, attr->addr); + } + + return -ENXIO; +} + +static int kvm_vfio_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_VFIO_GROUP: + switch (attr->attr) { + case KVM_DEV_VFIO_GROUP_ADD: + case KVM_DEV_VFIO_GROUP_DEL: + return 0; + } + + break; + } + + return -ENXIO; +} + +static void kvm_vfio_destroy(struct kvm_device *dev) +{ + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_group *kvg, *tmp; + + list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { + kvm_vfio_group_put_external_user(kvg->vfio_group); + list_del(&kvg->node); + kfree(kvg); + } + + kvm_vfio_update_coherency(dev); + + kfree(kv); + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */ +} + +static int kvm_vfio_create(struct kvm_device *dev, u32 type) +{ + struct kvm_device *tmp; + struct kvm_vfio *kv; + + /* Only one VFIO "device" per VM */ + list_for_each_entry(tmp, &dev->kvm->devices, vm_node) + if (tmp->ops == &kvm_vfio_ops) + return -EBUSY; + + kv = kzalloc(sizeof(*kv), GFP_KERNEL); + if (!kv) + return -ENOMEM; + + INIT_LIST_HEAD(&kv->group_list); + mutex_init(&kv->lock); + + dev->private = kv; + + return 0; +} + +struct kvm_device_ops kvm_vfio_ops = { + .name = "kvm-vfio", + .create = kvm_vfio_create, + .destroy = kvm_vfio_destroy, + .set_attr = kvm_vfio_set_attr, + .has_attr = kvm_vfio_has_attr, +}; |