diff options
124 files changed, 5419 insertions, 1968 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 50d82ae..4ba1eb7 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -588,3 +588,10 @@ Why: Remount currently allows changing bound subsystems and replaced with conventional fsnotify. ---------------------------- + +What: KVM debugfs statistics +When: 2013 +Why: KVM tracepoints provide mostly equivalent information in a much more + flexible fashion. + +---------------------------- diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6386f8c..9301266 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2,6 +2,7 @@ The Definitive KVM (Kernel-based Virtual Machine) API Documentation =================================================================== 1. General description +---------------------- The kvm API is a set of ioctls that are issued to control various aspects of a virtual machine. The ioctls belong to three classes @@ -23,7 +24,9 @@ of a virtual machine. The ioctls belong to three classes Only run vcpu ioctls from the same thread that was used to create the vcpu. + 2. File descriptors +------------------- The kvm API is centered around file descriptors. An initial open("/dev/kvm") obtains a handle to the kvm subsystem; this handle @@ -41,7 +44,9 @@ not cause harm to the host, their actual behavior is not guaranteed by the API. The only supported use is one virtual machine per process, and one vcpu per thread. + 3. Extensions +------------- As of Linux 2.6.22, the KVM ABI has been stabilized: no backward incompatible change are allowed. However, there is an extension @@ -53,7 +58,9 @@ Instead, kvm defines extension identifiers and a facility to query whether a particular extension identifier is available. If it is, a set of ioctls is available for application use. + 4. API description +------------------ This section describes ioctls that can be used to control kvm guests. For each ioctl, the following information is provided along with a @@ -75,6 +82,7 @@ description: Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) are not detailed, but errors with specific meanings are. + 4.1 KVM_GET_API_VERSION Capability: basic @@ -90,6 +98,7 @@ supported. Applications should refuse to run if KVM_GET_API_VERSION returns a value other than 12. If this check passes, all ioctls described as 'basic' will be available. + 4.2 KVM_CREATE_VM Capability: basic @@ -109,6 +118,7 @@ In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). + 4.3 KVM_GET_MSR_INDEX_LIST Capability: basic @@ -135,6 +145,7 @@ Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are not returned in the MSR list, as different vcpus can have a different number of banks, as set via the KVM_X86_SETUP_MCE ioctl. + 4.4 KVM_CHECK_EXTENSION Capability: basic @@ -149,6 +160,7 @@ receives an integer that describes the extension availability. Generally 0 means no and 1 means yes, but some extensions may report additional information in the integer return value. + 4.5 KVM_GET_VCPU_MMAP_SIZE Capability: basic @@ -161,6 +173,7 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared memory region. This ioctl returns the size of that region. See the KVM_RUN documentation for details. + 4.6 KVM_SET_MEMORY_REGION Capability: basic @@ -171,6 +184,7 @@ Returns: 0 on success, -1 on error This ioctl is obsolete and has been removed. + 4.7 KVM_CREATE_VCPU Capability: basic @@ -223,6 +237,7 @@ machines, the resulting vcpu fd can be memory mapped at page offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual cpu's hardware control block. + 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic @@ -246,6 +261,7 @@ since the last call to this ioctl. Bit 0 is the first page in the memory slot. Ensure the entire structure is cleared to avoid padding issues. + 4.9 KVM_SET_MEMORY_ALIAS Capability: basic @@ -256,6 +272,7 @@ Returns: 0 (success), -1 (error) This ioctl is obsolete and has been removed. + 4.10 KVM_RUN Capability: basic @@ -272,6 +289,7 @@ obtained by mmap()ing the vcpu fd at offset 0, with the size given by KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct kvm_run' (see below). + 4.11 KVM_GET_REGS Capability: basic @@ -292,6 +310,7 @@ struct kvm_regs { __u64 rip, rflags; }; + 4.12 KVM_SET_REGS Capability: basic @@ -304,6 +323,7 @@ Writes the general purpose registers into the vcpu. See KVM_GET_REGS for the data structure. + 4.13 KVM_GET_SREGS Capability: basic @@ -331,6 +351,7 @@ interrupt_bitmap is a bitmap of pending external interrupts. At most one bit may be set. This interrupt has been acknowledged by the APIC but not yet injected into the cpu core. + 4.14 KVM_SET_SREGS Capability: basic @@ -342,6 +363,7 @@ Returns: 0 on success, -1 on error Writes special registers into the vcpu. See KVM_GET_SREGS for the data structures. + 4.15 KVM_TRANSLATE Capability: basic @@ -365,6 +387,7 @@ struct kvm_translation { __u8 pad[5]; }; + 4.16 KVM_INTERRUPT Capability: basic @@ -413,6 +436,7 @@ c) KVM_INTERRUPT_SET_LEVEL Note that any value for 'irq' other than the ones stated above is invalid and incurs unexpected behavior. + 4.17 KVM_DEBUG_GUEST Capability: basic @@ -423,6 +447,7 @@ Returns: -1 on error Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + 4.18 KVM_GET_MSRS Capability: basic @@ -451,6 +476,7 @@ Application code should set the 'nmsrs' member (which indicates the size of the entries array) and the 'index' member of each array entry. kvm will fill in the 'data' member. + 4.19 KVM_SET_MSRS Capability: basic @@ -466,6 +492,7 @@ Application code should set the 'nmsrs' member (which indicates the size of the entries array), and the 'index' and 'data' members of each array entry. + 4.20 KVM_SET_CPUID Capability: basic @@ -494,6 +521,7 @@ struct kvm_cpuid { struct kvm_cpuid_entry entries[0]; }; + 4.21 KVM_SET_SIGNAL_MASK Capability: basic @@ -516,6 +544,7 @@ struct kvm_signal_mask { __u8 sigset[0]; }; + 4.22 KVM_GET_FPU Capability: basic @@ -541,6 +570,7 @@ struct kvm_fpu { __u32 pad2; }; + 4.23 KVM_SET_FPU Capability: basic @@ -566,6 +596,7 @@ struct kvm_fpu { __u32 pad2; }; + 4.24 KVM_CREATE_IRQCHIP Capability: KVM_CAP_IRQCHIP @@ -579,6 +610,7 @@ ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. On ia64, a IOSAPIC is created. + 4.25 KVM_IRQ_LINE Capability: KVM_CAP_IRQCHIP @@ -600,6 +632,7 @@ struct kvm_irq_level { __u32 level; /* 0 or 1 */ }; + 4.26 KVM_GET_IRQCHIP Capability: KVM_CAP_IRQCHIP @@ -621,6 +654,7 @@ struct kvm_irqchip { } chip; }; + 4.27 KVM_SET_IRQCHIP Capability: KVM_CAP_IRQCHIP @@ -642,6 +676,7 @@ struct kvm_irqchip { } chip; }; + 4.28 KVM_XEN_HVM_CONFIG Capability: KVM_CAP_XEN_HVM @@ -666,6 +701,7 @@ struct kvm_xen_hvm_config { __u8 pad2[30]; }; + 4.29 KVM_GET_CLOCK Capability: KVM_CAP_ADJUST_CLOCK @@ -684,6 +720,7 @@ struct kvm_clock_data { __u32 pad[9]; }; + 4.30 KVM_SET_CLOCK Capability: KVM_CAP_ADJUST_CLOCK @@ -702,6 +739,7 @@ struct kvm_clock_data { __u32 pad[9]; }; + 4.31 KVM_GET_VCPU_EVENTS Capability: KVM_CAP_VCPU_EVENTS @@ -741,6 +779,7 @@ struct kvm_vcpu_events { KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that interrupt.shadow contains a valid state. Otherwise, this field is undefined. + 4.32 KVM_SET_VCPU_EVENTS Capability: KVM_CAP_VCPU_EVENTS @@ -767,6 +806,7 @@ If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in the flags field to signal that interrupt.shadow contains a valid state and shall be written into the VCPU. + 4.33 KVM_GET_DEBUGREGS Capability: KVM_CAP_DEBUGREGS @@ -785,6 +825,7 @@ struct kvm_debugregs { __u64 reserved[9]; }; + 4.34 KVM_SET_DEBUGREGS Capability: KVM_CAP_DEBUGREGS @@ -798,6 +839,7 @@ Writes debug registers into the vcpu. See KVM_GET_DEBUGREGS for the data structure. The flags field is unused yet and must be cleared on entry. + 4.35 KVM_SET_USER_MEMORY_REGION Capability: KVM_CAP_USER_MEM @@ -844,6 +886,7 @@ It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. The KVM_SET_MEMORY_REGION does not allow fine grained control over memory allocation and is deprecated. + 4.36 KVM_SET_TSS_ADDR Capability: KVM_CAP_SET_TSS_ADDR @@ -862,6 +905,7 @@ This ioctl is required on Intel-based hosts. This is needed on Intel hardware because of a quirk in the virtualization implementation (see the internals documentation when it pops into existence). + 4.37 KVM_ENABLE_CAP Capability: KVM_CAP_ENABLE_CAP @@ -897,6 +941,7 @@ function properly, this is the place to put them. __u8 pad[64]; }; + 4.38 KVM_GET_MP_STATE Capability: KVM_CAP_MP_STATE @@ -927,6 +972,7 @@ Possible values are: This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel irqchip, the multiprocessing state must be maintained by userspace. + 4.39 KVM_SET_MP_STATE Capability: KVM_CAP_MP_STATE @@ -941,6 +987,7 @@ arguments. This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel irqchip, the multiprocessing state must be maintained by userspace. + 4.40 KVM_SET_IDENTITY_MAP_ADDR Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR @@ -959,6 +1006,7 @@ This ioctl is required on Intel-based hosts. This is needed on Intel hardware because of a quirk in the virtualization implementation (see the internals documentation when it pops into existence). + 4.41 KVM_SET_BOOT_CPU_ID Capability: KVM_CAP_SET_BOOT_CPU_ID @@ -971,6 +1019,7 @@ Define which vcpu is the Bootstrap Processor (BSP). Values are the same as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default is vcpu 0. + 4.42 KVM_GET_XSAVE Capability: KVM_CAP_XSAVE @@ -985,6 +1034,7 @@ struct kvm_xsave { This ioctl would copy current vcpu's xsave struct to the userspace. + 4.43 KVM_SET_XSAVE Capability: KVM_CAP_XSAVE @@ -999,6 +1049,7 @@ struct kvm_xsave { This ioctl would copy userspace's xsave struct to the kernel. + 4.44 KVM_GET_XCRS Capability: KVM_CAP_XCRS @@ -1022,6 +1073,7 @@ struct kvm_xcrs { This ioctl would copy current vcpu's xcrs to the userspace. + 4.45 KVM_SET_XCRS Capability: KVM_CAP_XCRS @@ -1045,6 +1097,7 @@ struct kvm_xcrs { This ioctl would set vcpu's xcr to the value userspace specified. + 4.46 KVM_GET_SUPPORTED_CPUID Capability: KVM_CAP_EXT_CPUID @@ -1119,6 +1172,7 @@ support. Instead it is reported via if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + 4.47 KVM_PPC_GET_PVINFO Capability: KVM_CAP_PPC_GET_PVINFO @@ -1142,6 +1196,7 @@ of 4 instructions that make up a hypercall. If any additional field gets added to this structure later on, a bit for that additional piece of information will be set in the flags bitmap. + 4.48 KVM_ASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_ASSIGNMENT @@ -1185,6 +1240,7 @@ Only PCI header type 0 devices with PCI BAR resources are supported by device assignment. The user requesting this ioctl must have read/write access to the PCI sysfs resource files associated with the device. + 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT @@ -1198,6 +1254,7 @@ Ends PCI device assignment, releasing all associated resources. See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is used in kvm_assigned_pci_dev to identify the device. + 4.50 KVM_ASSIGN_DEV_IRQ Capability: KVM_CAP_ASSIGN_DEV_IRQ @@ -1231,6 +1288,7 @@ The following flags are defined: It is not valid to specify multiple types per host or guest IRQ. However, the IRQ type of host and guest can differ or can even be null. + 4.51 KVM_DEASSIGN_DEV_IRQ Capability: KVM_CAP_ASSIGN_DEV_IRQ @@ -1245,6 +1303,7 @@ See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified by assigned_dev_id, flags must correspond to the IRQ type specified on KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. + 4.52 KVM_SET_GSI_ROUTING Capability: KVM_CAP_IRQ_ROUTING @@ -1293,6 +1352,7 @@ struct kvm_irq_routing_msi { __u32 pad; }; + 4.53 KVM_ASSIGN_SET_MSIX_NR Capability: KVM_CAP_DEVICE_MSIX @@ -1314,6 +1374,7 @@ struct kvm_assigned_msix_nr { #define KVM_MAX_MSIX_PER_DEV 256 + 4.54 KVM_ASSIGN_SET_MSIX_ENTRY Capability: KVM_CAP_DEVICE_MSIX @@ -1332,7 +1393,8 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; -4.54 KVM_SET_TSC_KHZ + +4.55 KVM_SET_TSC_KHZ Capability: KVM_CAP_TSC_CONTROL Architectures: x86 @@ -1343,7 +1405,8 @@ Returns: 0 on success, -1 on error Specifies the tsc frequency for the virtual machine. The unit of the frequency is KHz. -4.55 KVM_GET_TSC_KHZ + +4.56 KVM_GET_TSC_KHZ Capability: KVM_CAP_GET_TSC_KHZ Architectures: x86 @@ -1355,7 +1418,8 @@ Returns the tsc frequency of the guest. The unit of the return value is KHz. If the host has unstable tsc this ioctl returns -EIO instead as an error. -4.56 KVM_GET_LAPIC + +4.57 KVM_GET_LAPIC Capability: KVM_CAP_IRQCHIP Architectures: x86 @@ -1371,7 +1435,8 @@ struct kvm_lapic_state { Reads the Local APIC registers and copies them into the input argument. The data format and layout are the same as documented in the architecture manual. -4.57 KVM_SET_LAPIC + +4.58 KVM_SET_LAPIC Capability: KVM_CAP_IRQCHIP Architectures: x86 @@ -1387,7 +1452,8 @@ struct kvm_lapic_state { Copies the input argument into the the Local APIC registers. The data format and layout are the same as documented in the architecture manual. -4.58 KVM_IOEVENTFD + +4.59 KVM_IOEVENTFD Capability: KVM_CAP_IOEVENTFD Architectures: all @@ -1417,7 +1483,8 @@ The following flags are defined: If datamatch flag is set, the event will be signaled only if the written value to the registered address is equal to datamatch in struct kvm_ioeventfd. -4.59 KVM_DIRTY_TLB + +4.60 KVM_DIRTY_TLB Capability: KVM_CAP_SW_TLB Architectures: ppc @@ -1449,7 +1516,8 @@ The "num_dirty" field is a performance hint for KVM to determine whether it should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. -4.60 KVM_ASSIGN_SET_INTX_MASK + +4.61 KVM_ASSIGN_SET_INTX_MASK Capability: KVM_CAP_PCI_2_3 Architectures: x86 @@ -1482,6 +1550,7 @@ See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is evaluated. + 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE @@ -1517,6 +1586,7 @@ the entries written by kernel-handled H_PUT_TCE calls, and also lets userspace update the TCE table directly which is useful in some circumstances. + 4.63 KVM_ALLOCATE_RMA Capability: KVM_CAP_PPC_RMA @@ -1549,6 +1619,7 @@ is supported; 2 if the processor requires all virtual machines to have an RMA, or 1 if the processor can use an RMA but doesn't require it, because it supports the Virtual RMA (VRMA) facility. + 4.64 KVM_NMI Capability: KVM_CAP_USER_NMI @@ -1574,6 +1645,7 @@ following algorithm: Some guests configure the LINT1 NMI input to cause a panic, aiding in debugging. + 4.65 KVM_S390_UCAS_MAP Capability: KVM_CAP_S390_UCONTROL @@ -1593,6 +1665,7 @@ This ioctl maps the memory at "user_addr" with the length "length" to the vcpu's address space starting at "vcpu_addr". All parameters need to be alligned by 1 megabyte. + 4.66 KVM_S390_UCAS_UNMAP Capability: KVM_CAP_S390_UCONTROL @@ -1612,6 +1685,7 @@ This ioctl unmaps the memory in the vcpu's address space starting at "vcpu_addr" with the length "length". The field "user_addr" is ignored. All parameters need to be alligned by 1 megabyte. + 4.67 KVM_S390_VCPU_FAULT Capability: KVM_CAP_S390_UCONTROL @@ -1628,6 +1702,7 @@ table upfront. This is useful to handle validity intercepts for user controlled virtual machines to fault in the virtual cpu's lowcore pages prior to calling the KVM_RUN ioctl. + 4.68 KVM_SET_ONE_REG Capability: KVM_CAP_ONE_REG @@ -1653,6 +1728,7 @@ registers, find a list below: | | PPC | KVM_REG_PPC_HIOR | 64 + 4.69 KVM_GET_ONE_REG Capability: KVM_CAP_ONE_REG @@ -1669,7 +1745,193 @@ at the memory location pointed to by "addr". The list of registers accessible using this interface is identical to the list in 4.64. + +4.70 KVM_KVMCLOCK_CTRL + +Capability: KVM_CAP_KVMCLOCK_CTRL +Architectures: Any that implement pvclocks (currently x86 only) +Type: vcpu ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This signals to the host kernel that the specified guest is being paused by +userspace. The host will set a flag in the pvclock structure that is checked +from the soft lockup watchdog. The flag is part of the pvclock structure that +is shared between guest and host, specifically the second bit of the flags +field of the pvclock_vcpu_time_info structure. It will be set exclusively by +the host and read/cleared exclusively by the guest. The guest operation of +checking and clearing the flag must an atomic operation so +load-link/store-conditional, or equivalent must be used. There are two cases +where the guest will clear the flag: when the soft lockup watchdog timer resets +itself or when a soft lockup is detected. This ioctl can be called any time +after pausing the vcpu, but before it is resumed. + + +4.71 KVM_SIGNAL_MSI + +Capability: KVM_CAP_SIGNAL_MSI +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_msi (in) +Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error + +Directly inject a MSI message. Only valid with in-kernel irqchip that handles +MSI messages. + +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u8 pad[16]; +}; + +No flags are defined so far. The corresponding field must be 0. + + +4.71 KVM_CREATE_PIT2 + +Capability: KVM_CAP_PIT2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_config (in) +Returns: 0 on success, -1 on error + +Creates an in-kernel device model for the i8254 PIT. This call is only valid +after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following +parameters have to be passed: + +struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; +}; + +Valid flags are: + +#define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ + +PIT timer interrupts may use a per-VM kernel thread for injection. If it +exists, this thread will have a name of the following pattern: + +kvm-pit/<owner-process-pid> + +When running a guest with elevated priorities, the scheduling parameters of +this thread may have to be adjusted accordingly. + +This IOCTL replaces the obsolete KVM_CREATE_PIT. + + +4.72 KVM_GET_PIT2 + +Capability: KVM_CAP_PIT_STATE2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_state2 (out) +Returns: 0 on success, -1 on error + +Retrieves the state of the in-kernel PIT model. Only valid after +KVM_CREATE_PIT2. The state is returned in the following structure: + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + +Valid flags are: + +/* disable PIT in HPET legacy mode */ +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +This IOCTL replaces the obsolete KVM_GET_PIT. + + +4.73 KVM_SET_PIT2 + +Capability: KVM_CAP_PIT_STATE2 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_pit_state2 (in) +Returns: 0 on success, -1 on error + +Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. +See KVM_GET_PIT2 for details on struct kvm_pit_state2. + +This IOCTL replaces the obsolete KVM_SET_PIT. + + +4.74 KVM_PPC_GET_SMMU_INFO + +Capability: KVM_CAP_PPC_GET_SMMU_INFO +Architectures: powerpc +Type: vm ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This populates and returns a structure describing the features of +the "Server" class MMU emulation supported by KVM. +This can in turn be used by userspace to generate the appropariate +device-tree properties for the guest operating system. + +The structure contains some global informations, followed by an +array of supported segment page sizes: + + struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u32 pad; + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +The supported flags are: + + - KVM_PPC_PAGE_SIZES_REAL: + When that flag is set, guest page sizes must "fit" the backing + store page sizes. When not set, any page size in the list can + be used regardless of how they are backed by userspace. + + - KVM_PPC_1T_SEGMENTS + The emulated MMU supports 1T segments in addition to the + standard 256M ones. + +The "slb_size" field indicates how many SLB entries are supported + +The "sps" array contains 8 entries indicating the supported base +page sizes for a segment in increasing order. Each entry is defined +as follow: + + struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +An entry with a "page_shift" of 0 is unused. Because the array is +organized in increasing order, a lookup can stop when encoutering +such an entry. + +The "slb_enc" field provides the encoding to use in the SLB for the +page size. The bits are in positions such as the value can directly +be OR'ed into the "vsid" argument of the slbmte instruction. + +The "enc" array is a list which for each of those segment base page +size provides the list of supported actual page sizes (which can be +only larger or equal to the base page size), along with the +corresponding encoding in the hash PTE. Similarily, the array is +8 entries sorted by increasing sizes and an entry with a "0" shift +is an empty entry and a terminator: + + struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ + }; + +The "pte_enc" field provides a value that can OR'ed into the hash +PTE's RPN field (ie, it needs to be shifted left by 12 to OR it +into the hash PTE second double word). + 5. The kvm_run structure +------------------------ Application code obtains a pointer to the kvm_run structure by mmap()ing a vcpu fd. From that point, application code can control @@ -1910,7 +2172,9 @@ and usually define the validity of a groups of registers. (e.g. one bit }; + 6. Capabilities that can be enabled +----------------------------------- There are certain capabilities that change the behavior of the virtual CPU when enabled. To enable them, please see section 4.37. Below you can find a list of @@ -1926,6 +2190,7 @@ The following information is provided along with the description: Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) are not detailed, but errors with specific meanings are. + 6.1 KVM_CAP_PPC_OSI Architectures: ppc @@ -1939,6 +2204,7 @@ between the guest and the host. When this capability is enabled, KVM_EXIT_OSI can occur. + 6.2 KVM_CAP_PPC_PAPR Architectures: ppc @@ -1957,6 +2223,7 @@ HTAB invisible to the guest. When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. + 6.3 KVM_CAP_SW_TLB Architectures: ppc diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 8820685..83afe65 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -10,11 +10,15 @@ a guest. KVM cpuid functions are: function: KVM_CPUID_SIGNATURE (0x40000000) -returns : eax = 0, +returns : eax = 0x40000001, ebx = 0x4b4d564b, ecx = 0x564b4d56, edx = 0x4d. Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". +The value in eax corresponds to the maximum cpuid function present in this leaf, +and will be updated if more functions are added in the future. +Note also that old hosts set eax value to 0x0. This should +be interpreted as if the value was 0x40000001. This function queries the presence of KVM cpuid leafs. diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 5031780..96b41bd 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -109,6 +109,10 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 0 | 24 | multiple cpus are guaranteed to | | be monotonic ------------------------------------------------------------- + | | guest vcpu has been paused by + 1 | N/A | the host + | | See 4.70 in api.txt + ------------------------------------------------------------- Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid leaf prior to usage. diff --git a/arch/alpha/include/asm/kvm_para.h b/arch/alpha/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/alpha/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/arm/include/asm/kvm_para.h b/arch/arm/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/arm/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/avr32/include/asm/kvm_para.h b/arch/avr32/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/avr32/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/blackfin/include/asm/kvm_para.h b/arch/blackfin/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/blackfin/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/c6x/include/asm/kvm_para.h b/arch/c6x/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/c6x/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/frv/include/asm/kvm_para.h b/arch/frv/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/frv/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/h8300/include/asm/kvm_para.h b/arch/h8300/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/h8300/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/hexagon/include/asm/kvm_para.h b/arch/hexagon/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/hexagon/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index e35b3a8..6d6a5ac 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -365,6 +365,7 @@ struct thash_cb { }; struct kvm_vcpu_stat { + u32 halt_wakeup; }; struct kvm_vcpu_arch { @@ -448,6 +449,8 @@ struct kvm_vcpu_arch { char log_buf[VMM_LOG_LEN]; union context host; union context guest; + + char mmio_data[8]; }; struct kvm_vm_stat { diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h index 1588aee..2019cb9 100644 --- a/arch/ia64/include/asm/kvm_para.h +++ b/arch/ia64/include/asm/kvm_para.h @@ -26,6 +26,11 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif #endif diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 463fb3b..bd77cb5 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -232,12 +232,12 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if ((p->addr & PAGE_MASK) == IOAPIC_DEFAULT_BASE_ADDRESS) goto mmio; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = kvm_run->mmio.phys_addr = p->addr; - vcpu->mmio_size = kvm_run->mmio.len = p->size; + vcpu->mmio_fragments[0].gpa = kvm_run->mmio.phys_addr = p->addr; + vcpu->mmio_fragments[0].len = kvm_run->mmio.len = p->size; vcpu->mmio_is_write = kvm_run->mmio.is_write = !p->dir; if (vcpu->mmio_is_write) - memcpy(vcpu->mmio_data, &p->data, p->size); + memcpy(vcpu->arch.mmio_data, &p->data, p->size); memcpy(kvm_run->mmio.data, &p->data, p->size); kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; @@ -719,7 +719,7 @@ static void kvm_set_mmio_data(struct kvm_vcpu *vcpu) struct kvm_mmio_req *p = kvm_get_vcpu_ioreq(vcpu); if (!vcpu->mmio_is_write) - memcpy(&p->data, vcpu->mmio_data, 8); + memcpy(&p->data, vcpu->arch.mmio_data, 8); p->state = STATE_IORESP_READY; } @@ -739,7 +739,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + memcpy(vcpu->arch.mmio_data, kvm_run->mmio.data, 8); kvm_set_mmio_data(vcpu); vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; @@ -1872,21 +1872,6 @@ void kvm_arch_hardware_unsetup(void) { } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - - me = get_cpu(); - if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) - smp_send_reschedule(cpu); - put_cpu(); -} - int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { return __apic_accept_irq(vcpu, irq->vector); @@ -1956,6 +1941,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) (kvm_highest_pending_irq(vcpu) != -1); } +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)); +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { diff --git a/arch/m68k/include/asm/kvm_para.h b/arch/m68k/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/m68k/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/microblaze/include/asm/kvm_para.h b/arch/microblaze/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/microblaze/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/mips/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/mn10300/include/asm/kvm_para.h b/arch/mn10300/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/mn10300/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/openrisc/include/asm/kvm_para.h b/arch/openrisc/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/openrisc/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/parisc/include/asm/kvm_para.h b/arch/parisc/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/parisc/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index b9219e9..50d82c8 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -168,6 +168,7 @@ extern const char *powerpc_base_platform; #define CPU_FTR_LWSYNC ASM_CONST(0x0000000008000000) #define CPU_FTR_NOEXECUTE ASM_CONST(0x0000000010000000) #define CPU_FTR_INDEXED_DCR ASM_CONST(0x0000000020000000) +#define CPU_FTR_EMB_HV ASM_CONST(0x0000000040000000) /* * Add the 64-bit processor unique features in the top half of the word; @@ -376,7 +377,8 @@ extern const char *powerpc_base_platform; #define CPU_FTRS_47X (CPU_FTRS_440x6) #define CPU_FTRS_E200 (CPU_FTR_USE_TB | CPU_FTR_SPE_COMP | \ CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \ - CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE) + CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \ + CPU_FTR_DEBUG_LVL_EXC) #define CPU_FTRS_E500 (CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \ CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \ CPU_FTR_NOEXECUTE) @@ -385,15 +387,15 @@ extern const char *powerpc_base_platform; CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE) #define CPU_FTRS_E500MC (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \ CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \ - CPU_FTR_DBELL) + CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV) #define CPU_FTRS_E5500 (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \ CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \ CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_DEBUG_LVL_EXC) + CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV) #define CPU_FTRS_E6500 (CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \ CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \ CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_DEBUG_LVL_EXC) + CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV) #define CPU_FTRS_GENERIC_32 (CPU_FTR_COMMON | CPU_FTR_NODSISRALIGN) /* 64-bit CPUs */ @@ -486,8 +488,10 @@ enum { CPU_FTRS_E200 | #endif #ifdef CONFIG_E500 - CPU_FTRS_E500 | CPU_FTRS_E500_2 | CPU_FTRS_E500MC | - CPU_FTRS_E5500 | CPU_FTRS_E6500 | + CPU_FTRS_E500 | CPU_FTRS_E500_2 | +#endif +#ifdef CONFIG_PPC_E500MC + CPU_FTRS_E500MC | CPU_FTRS_E5500 | CPU_FTRS_E6500 | #endif 0, }; @@ -531,9 +535,12 @@ enum { CPU_FTRS_E200 & #endif #ifdef CONFIG_E500 - CPU_FTRS_E500 & CPU_FTRS_E500_2 & CPU_FTRS_E500MC & - CPU_FTRS_E5500 & CPU_FTRS_E6500 & + CPU_FTRS_E500 & CPU_FTRS_E500_2 & +#endif +#ifdef CONFIG_PPC_E500MC + CPU_FTRS_E500MC & CPU_FTRS_E5500 & CPU_FTRS_E6500 & #endif + ~CPU_FTR_EMB_HV & /* can be removed at runtime */ CPU_FTRS_POSSIBLE, }; #endif /* __powerpc64__ */ diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index efa74ac..154c067 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -19,6 +19,9 @@ #define PPC_DBELL_MSG_BRDCAST (0x04000000) #define PPC_DBELL_TYPE(x) (((x) & 0xf) << (63-36)) +#define PPC_DBELL_TYPE_MASK PPC_DBELL_TYPE(0xf) +#define PPC_DBELL_LPID(x) ((x) << (63 - 49)) +#define PPC_DBELL_PIR_MASK 0x3fff enum ppc_dbell { PPC_DBELL = 0, /* doorbell */ PPC_DBELL_CRIT = 1, /* critical doorbell */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 6122523..423cf9e 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -133,6 +133,16 @@ #define H_PP1 (1UL<<(63-62)) #define H_PP2 (1UL<<(63-63)) +/* Flags for H_REGISTER_VPA subfunction field */ +#define H_VPA_FUNC_SHIFT (63-18) /* Bit posn of subfunction code */ +#define H_VPA_FUNC_MASK 7UL +#define H_VPA_REG_VPA 1UL /* Register Virtual Processor Area */ +#define H_VPA_REG_DTL 2UL /* Register Dispatch Trace Log */ +#define H_VPA_REG_SLB 3UL /* Register SLB shadow buffer */ +#define H_VPA_DEREG_VPA 5UL /* Deregister Virtual Processor Area */ +#define H_VPA_DEREG_DTL 6UL /* Deregister Dispatch Trace Log */ +#define H_VPA_DEREG_SLB 7UL /* Deregister SLB shadow buffer */ + /* VASI States */ #define H_VASI_INVALID 0 #define H_VASI_ENABLED 1 diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 51010bf..c9aac24 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -33,6 +33,7 @@ extern void __replay_interrupt(unsigned int vector); extern void timer_interrupt(struct pt_regs *); +extern void performance_monitor_exception(struct pt_regs *regs); #ifdef CONFIG_PPC64 #include <asm/paca.h> diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index b921c3f..1bea4d8 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -277,6 +277,7 @@ struct kvm_sync_regs { #define KVM_CPU_E500V2 2 #define KVM_CPU_3S_32 3 #define KVM_CPU_3S_64 4 +#define KVM_CPU_E500MC 5 /* for KVM_CAP_SPAPR_TCE */ struct kvm_create_spapr_tce { diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 7b1f0e0..76fdcfe 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -20,6 +20,16 @@ #ifndef __POWERPC_KVM_ASM_H__ #define __POWERPC_KVM_ASM_H__ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_64BIT +#define PPC_STD(sreg, offset, areg) std sreg, (offset)(areg) +#define PPC_LD(treg, offset, areg) ld treg, (offset)(areg) +#else +#define PPC_STD(sreg, offset, areg) stw sreg, (offset+4)(areg) +#define PPC_LD(treg, offset, areg) lwz treg, (offset+4)(areg) +#endif +#endif + /* IVPR must be 64KiB-aligned. */ #define VCPU_SIZE_ORDER 4 #define VCPU_SIZE_LOG (VCPU_SIZE_ORDER + 12) @@ -48,6 +58,14 @@ #define BOOKE_INTERRUPT_SPE_FP_DATA 33 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35 +#define BOOKE_INTERRUPT_DOORBELL 36 +#define BOOKE_INTERRUPT_DOORBELL_CRITICAL 37 + +/* booke_hv */ +#define BOOKE_INTERRUPT_GUEST_DBELL 38 +#define BOOKE_INTERRUPT_GUEST_DBELL_CRIT 39 +#define BOOKE_INTERRUPT_HV_SYSCALL 40 +#define BOOKE_INTERRUPT_HV_PRIV 41 /* book3s */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index fd07f43..f0e0c6a 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -453,4 +453,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) #define INS_DCBZ 0x7c0007ec +/* LPIDs we support with this build -- runtime limit may be lower */ +#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) + #endif /* __ASM_KVM_BOOK3S_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 1f2f5b6..88609b2 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -79,6 +79,9 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_64_HV + u8 hwthread_req; + u8 hwthread_state; + struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; unsigned long xics_phys; @@ -122,4 +125,9 @@ struct kvmppc_book3s_shadow_vcpu { #endif /*__ASSEMBLY__ */ +/* Values for kvm_state */ +#define KVM_HWTHREAD_IN_KERNEL 0 +#define KVM_HWTHREAD_IN_NAP 1 +#define KVM_HWTHREAD_IN_KVM 2 + #endif /* __ASM_KVM_BOOK3S_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h index a90e091..b7cd335 100644 --- a/arch/powerpc/include/asm/kvm_booke.h +++ b/arch/powerpc/include/asm/kvm_booke.h @@ -23,6 +23,9 @@ #include <linux/types.h> #include <linux/kvm_host.h> +/* LPIDs we support with this build -- runtime limit may be lower */ +#define KVMPPC_NR_LPIDS 64 + static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { vcpu->arch.gpr[num] = val; diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h new file mode 100644 index 0000000..30a600f --- /dev/null +++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h @@ -0,0 +1,49 @@ +/* + * Copyright 2010-2011 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef ASM_KVM_BOOKE_HV_ASM_H +#define ASM_KVM_BOOKE_HV_ASM_H + +#ifdef __ASSEMBLY__ + +/* + * All exceptions from guest state must go through KVM + * (except for those which are delivered directly to the guest) -- + * there are no exceptions for which we fall through directly to + * the normal host handler. + * + * Expected inputs (normal exceptions): + * SCRATCH0 = saved r10 + * r10 = thread struct + * r11 = appropriate SRR1 variant (currently used as scratch) + * r13 = saved CR + * *(r10 + THREAD_NORMSAVE(0)) = saved r11 + * *(r10 + THREAD_NORMSAVE(2)) = saved r13 + * + * Expected inputs (crit/mcheck/debug exceptions): + * appropriate SCRATCH = saved r8 + * r8 = exception level stack frame + * r9 = *(r8 + _CCR) = saved CR + * r11 = appropriate SRR1 variant (currently used as scratch) + * *(r8 + GPR9) = saved r9 + * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) + * *(r8 + GPR11) = saved r11 + */ +.macro DO_KVM intno srr1 +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ + bf 3, kvmppc_resume_\intno\()_\srr1 + b kvmppc_handler_\intno\()_\srr1 +kvmppc_resume_\intno\()_\srr1: +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif +.endm + +#endif /*__ASSEMBLY__ */ +#endif /* ASM_KVM_BOOKE_HV_ASM_H */ diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h deleted file mode 100644 index 8cd50a5..0000000 --- a/arch/powerpc/include/asm/kvm_e500.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. - * - * Author: Yu Liu, <yu.liu@freescale.com> - * - * Description: - * This file is derived from arch/powerpc/include/asm/kvm_44x.h, - * by Hollis Blanchard <hollisb@us.ibm.com>. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - */ - -#ifndef __ASM_KVM_E500_H__ -#define __ASM_KVM_E500_H__ - -#include <linux/kvm_host.h> - -#define BOOKE_INTERRUPT_SIZE 36 - -#define E500_PID_NUM 3 -#define E500_TLB_NUM 2 - -#define E500_TLB_VALID 1 -#define E500_TLB_DIRTY 2 - -struct tlbe_ref { - pfn_t pfn; - unsigned int flags; /* E500_TLB_* */ -}; - -struct tlbe_priv { - struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */ -}; - -struct vcpu_id_table; - -struct kvmppc_e500_tlb_params { - int entries, ways, sets; -}; - -struct kvmppc_vcpu_e500 { - /* Unmodified copy of the guest's TLB -- shared with host userspace. */ - struct kvm_book3e_206_tlb_entry *gtlb_arch; - - /* Starting entry number in gtlb_arch[] */ - int gtlb_offset[E500_TLB_NUM]; - - /* KVM internal information associated with each guest TLB entry */ - struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; - - struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; - - unsigned int gtlb_nv[E500_TLB_NUM]; - - /* - * information associated with each host TLB entry -- - * TLB1 only for now. If/when guest TLB1 entries can be - * mapped with host TLB0, this will be used for that too. - * - * We don't want to use this for guest TLB0 because then we'd - * have the overhead of doing the translation again even if - * the entry is still in the guest TLB (e.g. we swapped out - * and back, and our host TLB entries got evicted). - */ - struct tlbe_ref *tlb_refs[E500_TLB_NUM]; - unsigned int host_tlb1_nv; - - u32 host_pid[E500_PID_NUM]; - u32 pid[E500_PID_NUM]; - u32 svr; - - /* vcpu id table */ - struct vcpu_id_table *idt; - - u32 l1csr0; - u32 l1csr1; - u32 hid0; - u32 hid1; - u32 tlb0cfg; - u32 tlb1cfg; - u64 mcar; - - struct page **shared_tlb_pages; - int num_shared_tlb_pages; - - struct kvm_vcpu vcpu; -}; - -static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu); -} - -#endif /* __ASM_KVM_E500_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 52eb9c1..d848cdc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -82,7 +82,7 @@ struct kvm_vcpu; struct lppaca; struct slb_shadow; -struct dtl; +struct dtl_entry; struct kvm_vm_stat { u32 remote_tlb_flush; @@ -106,6 +106,8 @@ struct kvm_vcpu_stat { u32 dec_exits; u32 ext_intr_exits; u32 halt_wakeup; + u32 dbell_exits; + u32 gdbell_exits; #ifdef CONFIG_PPC_BOOK3S u32 pf_storage; u32 pf_instruc; @@ -140,6 +142,7 @@ enum kvm_exit_types { EMULATED_TLBSX_EXITS, EMULATED_TLBWE_EXITS, EMULATED_RFI_EXITS, + EMULATED_RFCI_EXITS, DEC_EXITS, EXT_INTR_EXITS, HALT_WAKEUP, @@ -147,6 +150,8 @@ enum kvm_exit_types { FP_UNAVAIL, DEBUG_EXITS, TIMEINGUEST, + DBELL_EXITS, + GDBELL_EXITS, __NUMBER_OF_KVM_EXIT_TYPES }; @@ -217,10 +222,10 @@ struct kvm_arch_memory_slot { }; struct kvm_arch { + unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; struct revmap_entry *revmap; - unsigned int lpid; unsigned int host_lpid; unsigned long host_lpcr; unsigned long sdr1; @@ -232,7 +237,6 @@ struct kvm_arch { unsigned long vrma_slb_v; int rma_setup_done; int using_mmu_notifiers; - struct list_head spapr_tce_tables; spinlock_t slot_phys_lock; unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; int slot_npages[KVM_MEM_SLOTS_NUM]; @@ -240,6 +244,9 @@ struct kvm_arch { struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; struct kvmppc_linear_info *hpt_li; #endif /* CONFIG_KVM_BOOK3S_64_HV */ +#ifdef CONFIG_PPC_BOOK3S_64 + struct list_head spapr_tce_tables; +#endif }; /* @@ -263,6 +270,9 @@ struct kvmppc_vcore { struct list_head runnable_threads; spinlock_t lock; wait_queue_head_t wq; + u64 stolen_tb; + u64 preempt_tb; + struct kvm_vcpu *runner; }; #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) @@ -274,6 +284,19 @@ struct kvmppc_vcore { #define VCORE_EXITING 2 #define VCORE_SLEEPING 3 +/* + * Struct used to manage memory for a virtual processor area + * registered by a PAPR guest. There are three types of area + * that a guest can register. + */ +struct kvmppc_vpa { + void *pinned_addr; /* Address in kernel linear mapping */ + void *pinned_end; /* End of region */ + unsigned long next_gpa; /* Guest phys addr for update */ + unsigned long len; /* Number of bytes required */ + u8 update_pending; /* 1 => update pinned_addr from next_gpa */ +}; + struct kvmppc_pte { ulong eaddr; u64 vpage; @@ -345,6 +368,17 @@ struct kvm_vcpu_arch { u64 vsr[64]; #endif +#ifdef CONFIG_KVM_BOOKE_HV + u32 host_mas4; + u32 host_mas6; + u32 shadow_epcr; + u32 epcr; + u32 shadow_msrp; + u32 eplc; + u32 epsc; + u32 oldpir; +#endif + #ifdef CONFIG_PPC_BOOK3S /* For Gekko paired singles */ u32 qpr[32]; @@ -370,6 +404,7 @@ struct kvm_vcpu_arch { #endif u32 vrsave; /* also USPRG0 */ u32 mmucr; + /* shadow_msr is unused for BookE HV */ ulong shadow_msr; ulong csrr0; ulong csrr1; @@ -426,8 +461,12 @@ struct kvm_vcpu_arch { ulong fault_esr; ulong queued_dear; ulong queued_esr; + u32 tlbcfg[4]; + u32 mmucfg; + u32 epr; #endif gpa_t paddr_accessed; + gva_t vaddr_accessed; u8 io_gpr; /* GPR used as IO source/target */ u8 mmio_is_bigendian; @@ -453,11 +492,6 @@ struct kvm_vcpu_arch { u8 prodded; u32 last_inst; - struct lppaca *vpa; - struct slb_shadow *slb_shadow; - struct dtl *dtl; - struct dtl *dtl_end; - wait_queue_head_t *wqp; struct kvmppc_vcore *vcore; int ret; @@ -482,6 +516,14 @@ struct kvm_vcpu_arch { struct task_struct *run_task; struct kvm_run *kvm_run; pgd_t *pgdir; + + spinlock_t vpa_update_lock; + struct kvmppc_vpa vpa; + struct kvmppc_vpa dtl; + struct dtl_entry *dtl_ptr; + unsigned long dtl_index; + u64 stolen_logged; + struct kvmppc_vpa slb_shadow; #endif }; @@ -498,4 +540,6 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_QPR 0x0040 #define KVM_MMIO_REG_FQPR 0x0060 +#define __KVM_HAVE_ARCH_WQP + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index 7b754e7..c18916b 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -206,6 +206,11 @@ static inline unsigned int kvm_arch_para_features(void) return r; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif /* __KERNEL__ */ #endif /* __POWERPC_KVM_PARA_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9d6dee0..f68c22f 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -95,7 +95,7 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); -extern void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); +extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); @@ -107,8 +107,10 @@ extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int op, int *advance); -extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); -extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); +extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, + ulong val); +extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, + ulong *val); extern int kvmppc_booke_init(void); extern void kvmppc_booke_exit(void); @@ -126,6 +128,8 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce); extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *rma); extern struct kvmppc_linear_info *kvm_alloc_rma(void); @@ -138,6 +142,11 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); +extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, + struct kvm_ppc_smmu_info *info); + +extern int kvmppc_bookehv_init(void); +extern void kvmppc_bookehv_exit(void); /* * Cuts out inst bits with ordering according to spec. @@ -204,4 +213,9 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, struct kvm_dirty_tlb *cfg); +long kvmppc_alloc_lpid(void); +void kvmppc_claim_lpid(long lpid); +void kvmppc_free_lpid(long lpid); +void kvmppc_init_lpid(unsigned long nr_lpids); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index cdb54218..eeabcdb 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -104,6 +104,8 @@ #define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ #define MAS4_TSIZED_SHIFT 7 +#define MAS5_SGS 0x80000000 + #define MAS6_SPID0 0x3FFF0000 #define MAS6_SPID1 0x00007FFE #define MAS6_ISIZE(x) MAS1_TSIZE(x) @@ -118,6 +120,10 @@ #define MAS7_RPN 0xFFFFFFFF +#define MAS8_TGS 0x80000000 /* Guest space */ +#define MAS8_VF 0x40000000 /* Virtualization Fault */ +#define MAS8_TLPID 0x000000ff + /* Bit definitions for MMUCFG */ #define MMUCFG_MAVN 0x00000003 /* MMU Architecture Version Number */ #define MMUCFG_MAVN_V1 0x00000000 /* v1.0 */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 55e8563..413a5ea 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -240,6 +240,9 @@ struct thread_struct { #ifdef CONFIG_KVM_BOOK3S_32_HANDLER void* kvm_shadow_vcpu; /* KVM internal data */ #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */ +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + struct kvm_vcpu *kvm_vcpu; +#endif #ifdef CONFIG_PPC64 unsigned long dscr; int dscr_inherit; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 9d7f0fb..f0cb7f4 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -257,7 +257,9 @@ #define LPCR_LPES_SH 2 #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ +#ifndef SPRN_LPID #define SPRN_LPID 0x13F /* Logical Partition Identifier */ +#endif #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ #define SPRN_HMER 0x150 /* Hardware m? error recovery */ #define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 8a97aa7..2d916c4 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -56,18 +56,30 @@ #define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ #define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ #define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ +#define SPRN_MSRP 0x137 /* MSR Protect Register */ #define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ #define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ #define SPRN_DVC1 0x13E /* Data Value Compare Register 1 */ #define SPRN_DVC2 0x13F /* Data Value Compare Register 2 */ +#define SPRN_LPID 0x152 /* Logical Partition ID */ #define SPRN_MAS8 0x155 /* MMU Assist Register 8 */ #define SPRN_TLB0PS 0x158 /* TLB 0 Page Size Register */ #define SPRN_TLB1PS 0x159 /* TLB 1 Page Size Register */ #define SPRN_MAS5_MAS6 0x15c /* MMU Assist Register 5 || 6 */ #define SPRN_MAS8_MAS1 0x15d /* MMU Assist Register 8 || 1 */ #define SPRN_EPTCFG 0x15e /* Embedded Page Table Config */ +#define SPRN_GSPRG0 0x170 /* Guest SPRG0 */ +#define SPRN_GSPRG1 0x171 /* Guest SPRG1 */ +#define SPRN_GSPRG2 0x172 /* Guest SPRG2 */ +#define SPRN_GSPRG3 0x173 /* Guest SPRG3 */ #define SPRN_MAS7_MAS3 0x174 /* MMU Assist Register 7 || 3 */ #define SPRN_MAS0_MAS1 0x175 /* MMU Assist Register 0 || 1 */ +#define SPRN_GSRR0 0x17A /* Guest SRR0 */ +#define SPRN_GSRR1 0x17B /* Guest SRR1 */ +#define SPRN_GEPR 0x17C /* Guest EPR */ +#define SPRN_GDEAR 0x17D /* Guest DEAR */ +#define SPRN_GPIR 0x17E /* Guest PIR */ +#define SPRN_GESR 0x17F /* Guest Exception Syndrome Register */ #define SPRN_IVOR0 0x190 /* Interrupt Vector Offset Register 0 */ #define SPRN_IVOR1 0x191 /* Interrupt Vector Offset Register 1 */ #define SPRN_IVOR2 0x192 /* Interrupt Vector Offset Register 2 */ @@ -88,6 +100,13 @@ #define SPRN_IVOR39 0x1B1 /* Interrupt Vector Offset Register 39 */ #define SPRN_IVOR40 0x1B2 /* Interrupt Vector Offset Register 40 */ #define SPRN_IVOR41 0x1B3 /* Interrupt Vector Offset Register 41 */ +#define SPRN_GIVOR2 0x1B8 /* Guest IVOR2 */ +#define SPRN_GIVOR3 0x1B9 /* Guest IVOR3 */ +#define SPRN_GIVOR4 0x1BA /* Guest IVOR4 */ +#define SPRN_GIVOR8 0x1BB /* Guest IVOR8 */ +#define SPRN_GIVOR13 0x1BC /* Guest IVOR13 */ +#define SPRN_GIVOR14 0x1BD /* Guest IVOR14 */ +#define SPRN_GIVPR 0x1BF /* Guest IVPR */ #define SPRN_SPEFSCR 0x200 /* SPE & Embedded FP Status & Control */ #define SPRN_BBEAR 0x201 /* Branch Buffer Entry Address Register */ #define SPRN_BBTAR 0x202 /* Branch Buffer Target Address Register */ @@ -240,6 +259,10 @@ #define MCSR_LDG 0x00002000UL /* Guarded Load */ #define MCSR_TLBSYNC 0x00000002UL /* Multiple tlbsyncs detected */ #define MCSR_BSL2_ERR 0x00000001UL /* Backside L2 cache error */ + +#define MSRP_UCLEP 0x04000000 /* Protect MSR[UCLE] */ +#define MSRP_DEP 0x00000200 /* Protect MSR[DE] */ +#define MSRP_PMMP 0x00000004 /* Protect MSR[PMM] */ #endif #ifdef CONFIG_E200 @@ -594,6 +617,17 @@ #define SPRN_EPCR_DMIUH 0x00400000 /* Disable MAS Interrupt updates * for hypervisor */ +/* Bit definitions for EPLC/EPSC */ +#define EPC_EPR 0x80000000 /* 1 = user, 0 = kernel */ +#define EPC_EPR_SHIFT 31 +#define EPC_EAS 0x40000000 /* Address Space */ +#define EPC_EAS_SHIFT 30 +#define EPC_EGS 0x20000000 /* 1 = guest, 0 = hypervisor */ +#define EPC_EGS_SHIFT 29 +#define EPC_ELPID 0x00ff0000 +#define EPC_ELPID_SHIFT 16 +#define EPC_EPID 0x00003fff +#define EPC_EPID_SHIFT 0 /* * The IBM-403 is an even more odd special case, as it is much diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 1a63202..200d763 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -17,6 +17,7 @@ extern struct task_struct *_switch(struct thread_struct *prev, struct thread_struct *next); extern void giveup_fpu(struct task_struct *); +extern void load_up_fpu(void); extern void disable_kernel_fp(void); extern void enable_kernel_fp(void); extern void flush_fp_to_thread(struct task_struct *); diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 2136f58..3b4b4a8 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -23,6 +23,7 @@ extern unsigned long tb_ticks_per_jiffy; extern unsigned long tb_ticks_per_usec; extern unsigned long tb_ticks_per_sec; +extern struct clock_event_device decrementer_clockevent; struct rtc_time; extern void to_tm(int tim, struct rtc_time * tm); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4554dc2..52c7ad7 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -116,6 +116,9 @@ int main(void) #ifdef CONFIG_KVM_BOOK3S_32_HANDLER DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); #endif +#ifdef CONFIG_KVM_BOOKE_HV + DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); +#endif DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags)); @@ -383,6 +386,7 @@ int main(void) #ifdef CONFIG_KVM DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); + DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr)); @@ -425,9 +429,11 @@ int main(void) DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4)); DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6)); + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); + DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); + /* book3s */ #ifdef CONFIG_KVM_BOOK3S_64_HV - DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -440,9 +446,9 @@ int main(void) DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); + DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); #endif #ifdef CONFIG_PPC_BOOK3S - DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); @@ -457,7 +463,6 @@ int main(void) DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded)); DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); - DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); @@ -533,6 +538,8 @@ int main(void) HSTATE_FIELD(HSTATE_NAPPING, napping); #ifdef CONFIG_KVM_BOOK3S_64_HV + HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req); + HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); @@ -593,6 +600,12 @@ int main(void) DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr)); #endif +#ifdef CONFIG_KVM_BOOKE_HV + DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4)); + DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6)); + DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc)); +#endif + #ifdef CONFIG_KVM_EXIT_TIMING DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, arch.timing_exit.tv32.tbu)); diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S index 8053db0..69fdd23 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -73,6 +73,7 @@ _GLOBAL(__setup_cpu_e500v2) mtlr r4 blr _GLOBAL(__setup_cpu_e500mc) + mr r5, r4 mflr r4 bl __e500_icache_setup bl __e500_dcache_setup diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f7bed44..1c06d29 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -63,11 +63,13 @@ BEGIN_FTR_SECTION GET_PACA(r13) #ifdef CONFIG_KVM_BOOK3S_64_HV - lbz r0,PACAPROCSTART(r13) - cmpwi r0,0x80 - bne 1f - li r0,1 - stb r0,PACAPROCSTART(r13) + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f b kvm_start_guest 1: #endif diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 22d608e..7a2e5e4 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -248,10 +248,11 @@ _ENTRY(_start); interrupt_base: /* Critical Input Interrupt */ - CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception) + CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) /* Machine Check Interrupt */ - CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception) + CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \ + machine_check_exception) MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception) /* Data Storage Interrupt */ @@ -261,7 +262,8 @@ interrupt_base: INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \ + do_IRQ, EXC_XFER_LITE) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -273,29 +275,32 @@ interrupt_base: #ifdef CONFIG_PPC_FPU FP_UNAVAILABLE_EXCEPTION #else - EXCEPTION(0x2010, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \ + FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG + NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL) EXC_XFER_EE_LITE(0x0c00, DoSyscall) /* Auxiliary Processor Unavailable Interrupt */ - EXCEPTION(0x2020, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ + AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x1010, FixedIntervalTimer, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \ + unknown_exception, EXC_XFER_EE) /* Watchdog Timer Interrupt */ /* TODO: Add watchdog support */ #ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x1020, WatchdogTimer, WatchdogException) + CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException) #else - CRITICAL_EXCEPTION(0x1020, WatchdogTimer, unknown_exception) + CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception) #endif /* Data TLB Error Interrupt */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 0e41753..5f051ee 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -2,6 +2,9 @@ #define __HEAD_BOOKE_H__ #include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ +#include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> + /* * Macros used for common Book-e exception handling */ @@ -28,14 +31,15 @@ */ #define THREAD_NORMSAVE(offset) (THREAD_NORMSAVES + (offset * 4)) -#define NORMAL_EXCEPTION_PROLOG \ +#define NORMAL_EXCEPTION_PROLOG(intno) \ mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ mfspr r10, SPRN_SPRG_THREAD; \ stw r11, THREAD_NORMSAVE(0)(r10); \ stw r13, THREAD_NORMSAVE(2)(r10); \ mfcr r13; /* save CR in r13 for now */\ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ - andi. r11,r11,MSR_PR; \ + mfspr r11, SPRN_SRR1; \ + DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \ + andi. r11, r11, MSR_PR; /* check whether user or kernel */\ mr r11, r1; \ beq 1f; \ /* if from user, start at top of this thread's kernel stack */ \ @@ -113,7 +117,7 @@ * registers as the normal prolog above. Instead we use a portion of the * critical/machine check exception stack at low physical addresses. */ -#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \ +#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \ mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \ stw r9,GPR9(r8); /* save various registers */\ @@ -121,8 +125,9 @@ stw r10,GPR10(r8); \ stw r11,GPR11(r8); \ stw r9,_CCR(r8); /* save CR on stack */\ - mfspr r10,exc_level_srr1; /* check whether user or kernel */\ - andi. r10,r10,MSR_PR; \ + mfspr r11,exc_level_srr1; /* check whether user or kernel */\ + DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \ + andi. r11,r11,MSR_PR; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ @@ -162,12 +167,30 @@ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) -#define CRITICAL_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1) +#define CRITICAL_EXCEPTION_PROLOG(intno) \ + EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1) #define DEBUG_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1) + EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1) #define MCHECK_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1) + EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \ + SPRN_MCSRR0, SPRN_MCSRR1) + +/* + * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite + * being delivered to the host. This exception can only happen + * inside a KVM guest -- so we just handle up to the DO_KVM rather + * than try to fit this into one of the existing prolog macros. + */ +#define GUEST_DOORBELL_EXCEPTION \ + START_EXCEPTION(GuestDoorbell); \ + mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ + mfspr r10, SPRN_SPRG_THREAD; \ + stw r11, THREAD_NORMSAVE(0)(r10); \ + mfspr r11, SPRN_SRR1; \ + stw r13, THREAD_NORMSAVE(2)(r10); \ + mfcr r13; /* save CR in r13 for now */\ + DO_KVM BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1; \ + trap /* * Exception vectors. @@ -181,16 +204,16 @@ label: .long func; \ .long ret_from_except_full -#define EXCEPTION(n, label, hdlr, xfer) \ +#define EXCEPTION(n, intno, label, hdlr, xfer) \ START_EXCEPTION(label); \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(intno); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) -#define CRITICAL_EXCEPTION(n, label, hdlr) \ - START_EXCEPTION(label); \ - CRITICAL_EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ +#define CRITICAL_EXCEPTION(n, intno, label, hdlr) \ + START_EXCEPTION(label); \ + CRITICAL_EXCEPTION_PROLOG(intno); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ NOCOPY, crit_transfer_to_handler, \ ret_from_crit_exc) @@ -302,7 +325,7 @@ label: #define DEBUG_CRIT_EXCEPTION \ START_EXCEPTION(DebugCrit); \ - CRITICAL_EXCEPTION_PROLOG; \ + CRITICAL_EXCEPTION_PROLOG(DEBUG); \ \ /* \ * If there is a single step or branch-taken exception in an \ @@ -355,7 +378,7 @@ label: #define DATA_STORAGE_EXCEPTION \ START_EXCEPTION(DataStorage) \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(DATA_STORAGE); \ mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ @@ -363,7 +386,7 @@ label: #define INSTRUCTION_STORAGE_EXCEPTION \ START_EXCEPTION(InstructionStorage) \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(INST_STORAGE); \ mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mr r4,r12; /* Pass SRR0 as arg2 */ \ @@ -372,7 +395,7 @@ label: #define ALIGNMENT_EXCEPTION \ START_EXCEPTION(Alignment) \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(ALIGNMENT); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \ stw r4,_DEAR(r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ @@ -380,7 +403,7 @@ label: #define PROGRAM_EXCEPTION \ START_EXCEPTION(Program) \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(PROGRAM); \ mfspr r4,SPRN_ESR; /* Grab the ESR and save it */ \ stw r4,_ESR(r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ @@ -388,7 +411,7 @@ label: #define DECREMENTER_EXCEPTION \ START_EXCEPTION(Decrementer) \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(DECREMENTER); \ lis r0,TSR_DIS@h; /* Setup the DEC interrupt mask */ \ mtspr SPRN_TSR,r0; /* Clear the DEC interrupt */ \ addi r3,r1,STACK_FRAME_OVERHEAD; \ @@ -396,7 +419,7 @@ label: #define FP_UNAVAILABLE_EXCEPTION \ START_EXCEPTION(FloatingPointUnavailable) \ - NORMAL_EXCEPTION_PROLOG; \ + NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL); \ beq 1f; \ bl load_up_fpu; /* if from user, just load it up */ \ b fast_exception_return; \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index de80e0f..1f4434a 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -301,19 +301,20 @@ _ENTRY(__early_start) interrupt_base: /* Critical Input Interrupt */ - CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception) + CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) /* Machine Check Interrupt */ #ifdef CONFIG_E200 /* no RFMCI, MCSRRs on E200 */ - CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception) + CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \ + machine_check_exception) #else MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception) #endif /* Data Storage Interrupt */ START_EXCEPTION(DataStorage) - NORMAL_EXCEPTION_PROLOG + NORMAL_EXCEPTION_PROLOG(DATA_STORAGE) mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ @@ -328,7 +329,7 @@ interrupt_base: INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -342,32 +343,36 @@ interrupt_base: #else #ifdef CONFIG_E200 /* E200 treats 'normal' floating point instructions as FP Unavail exception */ - EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE) + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ + program_check_exception, EXC_XFER_EE) #else - EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ + unknown_exception, EXC_XFER_EE) #endif #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG + NORMAL_EXCEPTION_PROLOG(SYSCALL) EXC_XFER_EE_LITE(0x0c00, DoSyscall) /* Auxiliary Processor Unavailable Interrupt */ - EXCEPTION(0x2900, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ + unknown_exception, EXC_XFER_EE) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x3100, FixedIntervalTimer, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x3100, FIT, FixedIntervalTimer, \ + unknown_exception, EXC_XFER_EE) /* Watchdog Timer Interrupt */ #ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x3200, WatchdogTimer, WatchdogException) + CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException) #else - CRITICAL_EXCEPTION(0x3200, WatchdogTimer, unknown_exception) + CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception) #endif /* Data TLB Error Interrupt */ @@ -375,10 +380,16 @@ interrupt_base: mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ mfspr r10, SPRN_SPRG_THREAD stw r11, THREAD_NORMSAVE(0)(r10) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif stw r12, THREAD_NORMSAVE(1)(r10) stw r13, THREAD_NORMSAVE(2)(r10) mfcr r13 stw r13, THREAD_NORMSAVE(3)(r10) + DO_KVM BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -463,10 +474,16 @@ interrupt_base: mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ mfspr r10, SPRN_SPRG_THREAD stw r11, THREAD_NORMSAVE(0)(r10) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif stw r12, THREAD_NORMSAVE(1)(r10) stw r13, THREAD_NORMSAVE(2)(r10) mfcr r13 stw r13, THREAD_NORMSAVE(3)(r10) + DO_KVM BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -538,36 +555,54 @@ interrupt_base: #ifdef CONFIG_SPE /* SPE Unavailable */ START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG + NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL) bne load_up_spe addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_EE_LITE(0x2010, KernelSPE) #else - EXCEPTION(0x2020, SPEUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ + unknown_exception, EXC_XFER_EE) #endif /* CONFIG_SPE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE - EXCEPTION(0x2030, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE); + EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, \ + SPEFloatingPointException, EXC_XFER_EE); /* SPE Floating Point Round */ - EXCEPTION(0x2050, SPEFloatingPointRound, SPEFloatingPointRoundException, EXC_XFER_EE) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ + SPEFloatingPointRoundException, EXC_XFER_EE) #else - EXCEPTION(0x2040, SPEFloatingPointData, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2050, SPEFloatingPointRound, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, \ + unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ + unknown_exception, EXC_XFER_EE) #endif /* CONFIG_SPE */ /* Performance Monitor */ - EXCEPTION(0x2060, PerformanceMonitor, performance_monitor_exception, EXC_XFER_STD) + EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ + performance_monitor_exception, EXC_XFER_STD) - EXCEPTION(0x2070, Doorbell, doorbell_exception, EXC_XFER_STD) + EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD) - CRITICAL_EXCEPTION(0x2080, CriticalDoorbell, unknown_exception) + CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \ + CriticalDoorbell, unknown_exception) /* Debug Interrupt */ DEBUG_DEBUG_EXCEPTION DEBUG_CRIT_EXCEPTION + GUEST_DOORBELL_EXCEPTION + + CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \ + unknown_exception) + + /* Hypercall */ + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE) + + /* Embedded Hypervisor Privilege */ + EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE) + /* * Local functions */ @@ -871,8 +906,31 @@ _GLOBAL(__setup_e500mc_ivors) mtspr SPRN_IVOR36,r3 li r3,CriticalDoorbell@l mtspr SPRN_IVOR37,r3 + + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r3, SPRN_MMUCFG + andis. r3, r3, MMUCFG_LPIDSIZE@h + beq no_hv + li r3,GuestDoorbell@l + mtspr SPRN_IVOR38,r3 + li r3,CriticalGuestDoorbell@l + mtspr SPRN_IVOR39,r3 + li r3,Hypercall@l + mtspr SPRN_IVOR40,r3 + li r3,Ehvpriv@l + mtspr SPRN_IVOR41,r3 +skip_hv_ivors: sync blr +no_hv: + lwz r3, CPU_SPEC_FEATURES(r5) + rlwinm r3, r3, 0, ~CPU_FTR_EMB_HV + stw r3, CPU_SPEC_FEATURES(r5) + b skip_hv_ivors #ifdef CONFIG_SPE /* diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 0cdc9a3..7140d83 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -16,6 +16,7 @@ #include <asm/asm-offsets.h> #include <asm/ppc-opcode.h> #include <asm/hw_irq.h> +#include <asm/kvm_book3s_asm.h> #undef DEBUG @@ -81,6 +82,12 @@ _GLOBAL(power7_idle) std r9,_MSR(r1) std r1,PACAR1(r13) +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* Tell KVM we're napping */ + li r4,KVM_HWTHREAD_IN_NAP + stb r4,HSTATE_HWTHREAD_STATE(r13) +#endif + /* Magic NAP mode enter sequence */ std r0,0(r1) ptesync diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 786a270..d1f2aaf 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -190,3 +190,7 @@ EXPORT_SYMBOL(__arch_hweight16); EXPORT_SYMBOL(__arch_hweight32); EXPORT_SYMBOL(__arch_hweight64); #endif + +#ifdef CONFIG_PPC_BOOK3S_64 +EXPORT_SYMBOL_GPL(mmu_psize_defs); +#endif diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2c42cd7..99a995c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -100,7 +100,7 @@ static int decrementer_set_next_event(unsigned long evt, static void decrementer_set_mode(enum clock_event_mode mode, struct clock_event_device *dev); -static struct clock_event_device decrementer_clockevent = { +struct clock_event_device decrementer_clockevent = { .name = "decrementer", .rating = 200, .irq = 0, @@ -108,6 +108,7 @@ static struct clock_event_device decrementer_clockevent = { .set_mode = decrementer_set_mode, .features = CLOCK_EVT_FEAT_ONESHOT, }; +EXPORT_SYMBOL(decrementer_clockevent); DEFINE_PER_CPU(u64, decrementers_next_tb); static DEFINE_PER_CPU(struct clock_event_device, decrementers); diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 7b612a7..50e7dbc 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -29,15 +29,18 @@ #include <asm/kvm_ppc.h> #include "44x_tlb.h" +#include "booke.h" void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + kvmppc_booke_vcpu_load(vcpu, cpu); kvmppc_44x_tlb_load(vcpu); } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { kvmppc_44x_tlb_put(vcpu); + kvmppc_booke_vcpu_put(vcpu); } int kvmppc_core_check_processor_compat(void) @@ -160,6 +163,15 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return 0; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ +} + static int __init kvmppc_44x_init(void) { int r; diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 549bb2c..c8c6157 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -37,22 +37,19 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { int emulated = EMULATE_DONE; - int dcrn; - int ra; - int rb; - int rc; - int rs; - int rt; - int ws; + int dcrn = get_dcrn(inst); + int ra = get_ra(inst); + int rb = get_rb(inst); + int rc = get_rc(inst); + int rs = get_rs(inst); + int rt = get_rt(inst); + int ws = get_ws(inst); switch (get_op(inst)) { case 31: switch (get_xop(inst)) { case XOP_MFDCR: - dcrn = get_dcrn(inst); - rt = get_rt(inst); - /* The guest may access CPR0 registers to determine the timebase * frequency, and it must know the real host frequency because it * can directly access the timebase registers. @@ -88,9 +85,6 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_MTDCR: - dcrn = get_dcrn(inst); - rs = get_rs(inst); - /* emulate some access in kernel */ switch (dcrn) { case DCRN_CPR0_CONFIG_ADDR: @@ -108,17 +102,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_TLBWE: - ra = get_ra(inst); - rs = get_rs(inst); - ws = get_ws(inst); emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws); break; case XOP_TLBSX: - rt = get_rt(inst); - ra = get_ra(inst); - rb = get_rb(inst); - rc = get_rc(inst); emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc); break; @@ -141,41 +128,41 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; switch (sprn) { case SPRN_PID: - kvmppc_set_pid(vcpu, kvmppc_get_gpr(vcpu, rs)); break; + kvmppc_set_pid(vcpu, spr_val); break; case SPRN_MMUCR: - vcpu->arch.mmucr = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.mmucr = spr_val; break; case SPRN_CCR0: - vcpu->arch.ccr0 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.ccr0 = spr_val; break; case SPRN_CCR1: - vcpu->arch.ccr1 = kvmppc_get_gpr(vcpu, rs); break; + vcpu->arch.ccr1 = spr_val; break; default: - emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val); } return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; switch (sprn) { case SPRN_PID: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.pid); break; + *spr_val = vcpu->arch.pid; break; case SPRN_MMUCR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucr); break; + *spr_val = vcpu->arch.mmucr; break; case SPRN_CCR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr0); break; + *spr_val = vcpu->arch.ccr0; break; case SPRN_CCR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ccr1); break; + *spr_val = vcpu->arch.ccr1; break; default: - emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); } return emulated; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 8f64709..f4dacb9 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -90,6 +90,9 @@ config KVM_BOOK3S_64_PR depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV select KVM_BOOK3S_PR +config KVM_BOOKE_HV + bool + config KVM_440 bool "KVM support for PowerPC 440 processors" depends on EXPERIMENTAL && 44x @@ -106,7 +109,7 @@ config KVM_440 config KVM_EXIT_TIMING bool "Detailed exit timing" - depends on KVM_440 || KVM_E500 + depends on KVM_440 || KVM_E500V2 || KVM_E500MC ---help--- Calculate elapsed time for every exit/enter cycle. A per-vcpu report is available in debugfs kvm/vm#_vcpu#_timing. @@ -115,14 +118,29 @@ config KVM_EXIT_TIMING If unsure, say N. -config KVM_E500 - bool "KVM support for PowerPC E500 processors" - depends on EXPERIMENTAL && E500 +config KVM_E500V2 + bool "KVM support for PowerPC E500v2 processors" + depends on EXPERIMENTAL && E500 && !PPC_E500MC select KVM select KVM_MMIO ---help--- Support running unmodified E500 guest kernels in virtual machines on - E500 host processors. + E500v2 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + +config KVM_E500MC + bool "KVM support for PowerPC E500MC/E5500 processors" + depends on EXPERIMENTAL && PPC_E500MC + select KVM + select KVM_MMIO + select KVM_BOOKE_HV + ---help--- + Support running unmodified E500MC/E5500 (32-bit) guest kernels in + virtual machines on E500MC/E5500 host processors. This module provides access to the hardware capabilities through a character device node named /dev/kvm. diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 3688aee..c2a0863 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -36,7 +36,17 @@ kvm-e500-objs := \ e500.o \ e500_tlb.o \ e500_emulate.o -kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) +kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) + +kvm-e500mc-objs := \ + $(common-objs-y) \ + booke.o \ + booke_emulate.o \ + bookehv_interrupts.o \ + e500mc.o \ + e500_tlb.o \ + e500_emulate.o +kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ ../../../virt/kvm/coalesced_mmio.o \ @@ -44,6 +54,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ book3s_paired_singles.o \ book3s_pr.o \ book3s_pr_papr.o \ + book3s_64_vio_hv.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ @@ -68,6 +79,7 @@ kvm-book3s_64-module-objs := \ powerpc.o \ emulate.o \ book3s.o \ + book3s_64_vio.o \ $(kvm-book3s_64-objs-y) kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) @@ -88,7 +100,8 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) kvm-objs := $(kvm-objs-m) $(kvm-objs-y) obj-$(CONFIG_KVM_440) += kvm.o -obj-$(CONFIG_KVM_E500) += kvm.o +obj-$(CONFIG_KVM_E500V2) += kvm.o +obj-$(CONFIG_KVM_E500MC) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 7d54f4e..3f2a836 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) return true; } -void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -283,12 +283,17 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) /* Tell the guest about our interrupt status */ kvmppc_update_int_pending(vcpu, *pending, old_pending); + + return 0; } pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) { ulong mp_pa = vcpu->arch.magic_page_pa; + if (!(vcpu->arch.shared->msr & MSR_SF)) + mp_pa = (uint32_t)mp_pa; + /* Magic page override */ if (unlikely(mp_pa) && unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) == diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c3beaee..80a5775 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -36,13 +36,11 @@ /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 -#define NR_LPIDS (LPID_RSVD + 1) -unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)]; long kvmppc_alloc_hpt(struct kvm *kvm) { unsigned long hpt; - unsigned long lpid; + long lpid; struct revmap_entry *rev; struct kvmppc_linear_info *li; @@ -72,14 +70,9 @@ long kvmppc_alloc_hpt(struct kvm *kvm) } kvm->arch.revmap = rev; - /* Allocate the guest's logical partition ID */ - do { - lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); - if (lpid >= NR_LPIDS) { - pr_err("kvm_alloc_hpt: No LPIDs free\n"); - goto out_freeboth; - } - } while (test_and_set_bit(lpid, lpid_inuse)); + lpid = kvmppc_alloc_lpid(); + if (lpid < 0) + goto out_freeboth; kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); kvm->arch.lpid = lpid; @@ -96,7 +89,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm) void kvmppc_free_hpt(struct kvm *kvm) { - clear_bit(kvm->arch.lpid, lpid_inuse); + kvmppc_free_lpid(kvm->arch.lpid); vfree(kvm->arch.revmap); if (kvm->arch.hpt_li) kvm_release_hpt(kvm->arch.hpt_li); @@ -171,8 +164,7 @@ int kvmppc_mmu_hv_init(void) if (!cpu_has_feature(CPU_FTR_HVMODE)) return -EINVAL; - memset(lpid_inuse, 0, sizeof(lpid_inuse)); - + /* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */ if (cpu_has_feature(CPU_FTR_ARCH_206)) { host_lpid = mfspr(SPRN_LPID); /* POWER7 */ rsvd_lpid = LPID_RSVD; @@ -181,9 +173,11 @@ int kvmppc_mmu_hv_init(void) rsvd_lpid = MAX_LPID_970; } - set_bit(host_lpid, lpid_inuse); + kvmppc_init_lpid(rsvd_lpid + 1); + + kvmppc_claim_lpid(host_lpid); /* rsvd_lpid is reserved for use in partition switching */ - set_bit(rsvd_lpid, lpid_inuse); + kvmppc_claim_lpid(rsvd_lpid); return 0; } @@ -452,7 +446,7 @@ static int instruction_is_store(unsigned int instr) } static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long gpa, int is_store) + unsigned long gpa, gva_t ea, int is_store) { int ret; u32 last_inst; @@ -499,6 +493,7 @@ static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, */ vcpu->arch.paddr_accessed = gpa; + vcpu->arch.vaddr_accessed = ea; return kvmppc_emulate_mmio(run, vcpu); } @@ -552,7 +547,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* No memslot means it's an emulated MMIO region */ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); - return kvmppc_hv_emulate_mmio(run, vcpu, gpa, + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, dsisr & DSISR_ISSTORE); } diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index f2e6e48..56b983e 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -90,8 +90,6 @@ slb_exit_skip_ ## num: or r10, r10, r12 slbie r10 - isync - /* Fill SLB with our shadow */ lbz r12, SVCPU_SLB_MAX(r3) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c new file mode 100644 index 0000000..72ffc89 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -0,0 +1,150 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/list.h> +#include <linux/anon_inodes.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_host.h> +#include <asm/udbg.h> + +#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) + +static long kvmppc_stt_npages(unsigned long window_size) +{ + return ALIGN((window_size >> SPAPR_TCE_SHIFT) + * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; +} + +static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +{ + struct kvm *kvm = stt->kvm; + int i; + + mutex_lock(&kvm->lock); + list_del(&stt->list); + for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + __free_page(stt->pages[i]); + kfree(stt); + mutex_unlock(&kvm->lock); + + kvm_put_kvm(kvm); +} + +static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + return VM_FAULT_SIGBUS; + + page = stt->pages[vmf->pgoff]; + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { + .fault = kvm_spapr_tce_fault, +}; + +static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_spapr_tce_vm_ops; + return 0; +} + +static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) +{ + struct kvmppc_spapr_tce_table *stt = filp->private_data; + + release_spapr_tce_table(stt); + return 0; +} + +static struct file_operations kvm_spapr_tce_fops = { + .mmap = kvm_spapr_tce_mmap, + .release = kvm_spapr_tce_release, +}; + +long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, + struct kvm_create_spapr_tce *args) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + long npages; + int ret = -ENOMEM; + int i; + + /* Check this LIOBN hasn't been previously allocated */ + list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt->liobn == args->liobn) + return -EBUSY; + } + + npages = kvmppc_stt_npages(args->window_size); + + stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), + GFP_KERNEL); + if (!stt) + goto fail; + + stt->liobn = args->liobn; + stt->window_size = args->window_size; + stt->kvm = kvm; + + for (i = 0; i < npages; i++) { + stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!stt->pages[i]) + goto fail; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + list_add(&stt->list, &kvm->arch.spapr_tce_tables); + + mutex_unlock(&kvm->lock); + + return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR); + +fail: + if (stt) { + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + } + return ret; +} diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ea0f8c5..30c2f3b 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -38,6 +38,9 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +/* WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 135663a..b9a989d 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -87,6 +87,10 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { int emulated = EMULATE_DONE; + int rt = get_rt(inst); + int rs = get_rs(inst); + int ra = get_ra(inst); + int rb = get_rb(inst); switch (get_op(inst)) { case 19: @@ -106,21 +110,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, case 31: switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - kvmppc_set_gpr(vcpu, get_rt(inst), - vcpu->arch.shared->msr); + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); break; case OP_31_XOP_MTMSRD: { - ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); + ulong rs_val = kvmppc_get_gpr(vcpu, rs); if (inst & 0x10000) { - vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE); - vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE); + ulong new_msr = vcpu->arch.shared->msr; + new_msr &= ~(MSR_RI | MSR_EE); + new_msr |= rs_val & (MSR_RI | MSR_EE); + vcpu->arch.shared->msr = new_msr; } else - kvmppc_set_msr(vcpu, rs); + kvmppc_set_msr(vcpu, rs_val); break; } case OP_31_XOP_MTMSR: - kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, get_rs(inst))); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_MFSR: { @@ -130,7 +135,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - kvmppc_set_gpr(vcpu, get_rt(inst), sr); + kvmppc_set_gpr(vcpu, rt, sr); } break; } @@ -138,29 +143,29 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, { int srnum; - srnum = (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf; + srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf; if (vcpu->arch.mmu.mfsrin) { u32 sr; sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); - kvmppc_set_gpr(vcpu, get_rt(inst), sr); + kvmppc_set_gpr(vcpu, rt, sr); } break; } case OP_31_XOP_MTSR: vcpu->arch.mmu.mtsrin(vcpu, (inst >> 16) & 0xf, - kvmppc_get_gpr(vcpu, get_rs(inst))); + kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_MTSRIN: vcpu->arch.mmu.mtsrin(vcpu, - (kvmppc_get_gpr(vcpu, get_rb(inst)) >> 28) & 0xf, - kvmppc_get_gpr(vcpu, get_rs(inst))); + (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf, + kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_TLBIE: case OP_31_XOP_TLBIEL: { bool large = (inst & 0x00200000) ? true : false; - ulong addr = kvmppc_get_gpr(vcpu, get_rb(inst)); + ulong addr = kvmppc_get_gpr(vcpu, rb); vcpu->arch.mmu.tlbie(vcpu, addr, large); break; } @@ -171,15 +176,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_FAIL; vcpu->arch.mmu.slbmte(vcpu, - kvmppc_get_gpr(vcpu, get_rs(inst)), - kvmppc_get_gpr(vcpu, get_rb(inst))); + kvmppc_get_gpr(vcpu, rs), + kvmppc_get_gpr(vcpu, rb)); break; case OP_31_XOP_SLBIE: if (!vcpu->arch.mmu.slbie) return EMULATE_FAIL; vcpu->arch.mmu.slbie(vcpu, - kvmppc_get_gpr(vcpu, get_rb(inst))); + kvmppc_get_gpr(vcpu, rb)); break; case OP_31_XOP_SLBIA: if (!vcpu->arch.mmu.slbia) @@ -191,22 +196,22 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!vcpu->arch.mmu.slbmfee) { emulated = EMULATE_FAIL; } else { - ulong t, rb; + ulong t, rb_val; - rb = kvmppc_get_gpr(vcpu, get_rb(inst)); - t = vcpu->arch.mmu.slbmfee(vcpu, rb); - kvmppc_set_gpr(vcpu, get_rt(inst), t); + rb_val = kvmppc_get_gpr(vcpu, rb); + t = vcpu->arch.mmu.slbmfee(vcpu, rb_val); + kvmppc_set_gpr(vcpu, rt, t); } break; case OP_31_XOP_SLBMFEV: if (!vcpu->arch.mmu.slbmfev) { emulated = EMULATE_FAIL; } else { - ulong t, rb; + ulong t, rb_val; - rb = kvmppc_get_gpr(vcpu, get_rb(inst)); - t = vcpu->arch.mmu.slbmfev(vcpu, rb); - kvmppc_set_gpr(vcpu, get_rt(inst), t); + rb_val = kvmppc_get_gpr(vcpu, rb); + t = vcpu->arch.mmu.slbmfev(vcpu, rb_val); + kvmppc_set_gpr(vcpu, rt, t); } break; case OP_31_XOP_DCBA: @@ -214,17 +219,17 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case OP_31_XOP_DCBZ: { - ulong rb = kvmppc_get_gpr(vcpu, get_rb(inst)); - ulong ra = 0; + ulong rb_val = kvmppc_get_gpr(vcpu, rb); + ulong ra_val = 0; ulong addr, vaddr; u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; u32 dsisr; int r; - if (get_ra(inst)) - ra = kvmppc_get_gpr(vcpu, get_ra(inst)); + if (ra) + ra_val = kvmppc_get_gpr(vcpu, ra); - addr = (ra + rb) & ~31ULL; + addr = (ra_val + rb_val) & ~31ULL; if (!(vcpu->arch.shared->msr & MSR_SF)) addr &= 0xffffffff; vaddr = addr; @@ -313,10 +318,9 @@ static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) return bat; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; - ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_SDR1: @@ -428,7 +432,7 @@ unprivileged: return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; @@ -441,46 +445,46 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn); if (sprn % 2) - kvmppc_set_gpr(vcpu, rt, bat->raw >> 32); + *spr_val = bat->raw >> 32; else - kvmppc_set_gpr(vcpu, rt, bat->raw); + *spr_val = bat->raw; break; } case SPRN_SDR1: if (!spr_allowed(vcpu, PRIV_HYPER)) goto unprivileged; - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); + *spr_val = to_book3s(vcpu)->sdr1; break; case SPRN_DSISR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr); + *spr_val = vcpu->arch.shared->dsisr; break; case SPRN_DAR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); + *spr_val = vcpu->arch.shared->dar; break; case SPRN_HIOR: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); + *spr_val = to_book3s(vcpu)->hior; break; case SPRN_HID0: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[0]); + *spr_val = to_book3s(vcpu)->hid[0]; break; case SPRN_HID1: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[1]); + *spr_val = to_book3s(vcpu)->hid[1]; break; case SPRN_HID2: case SPRN_HID2_GEKKO: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[2]); + *spr_val = to_book3s(vcpu)->hid[2]; break; case SPRN_HID4: case SPRN_HID4_GEKKO: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[4]); + *spr_val = to_book3s(vcpu)->hid[4]; break; case SPRN_HID5: - kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); + *spr_val = to_book3s(vcpu)->hid[5]; break; case SPRN_CFAR: case SPRN_PURR: - kvmppc_set_gpr(vcpu, rt, 0); + *spr_val = 0; break; case SPRN_GQR0: case SPRN_GQR1: @@ -490,8 +494,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_GQR5: case SPRN_GQR6: case SPRN_GQR7: - kvmppc_set_gpr(vcpu, rt, - to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]); + *spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]; break; case SPRN_THRM1: case SPRN_THRM2: @@ -506,7 +509,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PMC3_GEKKO: case SPRN_PMC4_GEKKO: case SPRN_WPAR_GEKKO: - kvmppc_set_gpr(vcpu, rt, 0); + *spr_val = 0; break; default: unprivileged: @@ -565,23 +568,22 @@ u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) { ulong dar = 0; - ulong ra; + ulong ra = get_ra(inst); + ulong rb = get_rb(inst); switch (get_op(inst)) { case OP_LFS: case OP_LFD: case OP_STFD: case OP_STFS: - ra = get_ra(inst); if (ra) dar = kvmppc_get_gpr(vcpu, ra); dar += (s32)((s16)inst); break; case 31: - ra = get_ra(inst); if (ra) dar = kvmppc_get_gpr(vcpu, ra); - dar += kvmppc_get_gpr(vcpu, get_rb(inst)); + dar += kvmppc_get_gpr(vcpu, rb); break; default: printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 108d1f5..c6af1d6 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -60,12 +60,20 @@ static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + struct kvmppc_vcore *vc = vcpu->arch.vcore; + local_paca->kvm_hstate.kvm_vcpu = vcpu; - local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore; + local_paca->kvm_hstate.kvm_vcore = vc; + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) + vc->stolen_tb += mftb() - vc->preempt_tb; } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) + vc->preempt_tb = mftb(); } void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) @@ -134,6 +142,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) vpa->yield_count = 1; } +/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ +struct reg_vpa { + u32 dummy; + union { + u16 hword; + u32 word; + } length; +}; + +static int vpa_is_registered(struct kvmppc_vpa *vpap) +{ + if (vpap->update_pending) + return vpap->next_gpa != 0; + return vpap->pinned_addr != NULL; +} + static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long vcpuid, unsigned long vpa) @@ -142,88 +166,182 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long len, nb; void *va; struct kvm_vcpu *tvcpu; - int err = H_PARAMETER; + int err; + int subfunc; + struct kvmppc_vpa *vpap; tvcpu = kvmppc_find_vcpu(kvm, vcpuid); if (!tvcpu) return H_PARAMETER; - flags >>= 63 - 18; - flags &= 7; - if (flags == 0 || flags == 4) - return H_PARAMETER; - if (flags < 4) { - if (vpa & 0x7f) + subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK; + if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL || + subfunc == H_VPA_REG_SLB) { + /* Registering new area - address must be cache-line aligned */ + if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa) return H_PARAMETER; - if (flags >= 2 && !tvcpu->arch.vpa) - return H_RESOURCE; - /* registering new area; convert logical addr to real */ + + /* convert logical addr to kernel addr and read length */ va = kvmppc_pin_guest_page(kvm, vpa, &nb); if (va == NULL) return H_PARAMETER; - if (flags <= 1) - len = *(unsigned short *)(va + 4); + if (subfunc == H_VPA_REG_VPA) + len = ((struct reg_vpa *)va)->length.hword; else - len = *(unsigned int *)(va + 4); - if (len > nb) - goto out_unpin; - switch (flags) { - case 1: /* register VPA */ - if (len < 640) - goto out_unpin; - if (tvcpu->arch.vpa) - kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa); - tvcpu->arch.vpa = va; - init_vpa(vcpu, va); - break; - case 2: /* register DTL */ - if (len < 48) - goto out_unpin; - len -= len % 48; - if (tvcpu->arch.dtl) - kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl); - tvcpu->arch.dtl = va; - tvcpu->arch.dtl_end = va + len; + len = ((struct reg_vpa *)va)->length.word; + kvmppc_unpin_guest_page(kvm, va); + + /* Check length */ + if (len > nb || len < sizeof(struct reg_vpa)) + return H_PARAMETER; + } else { + vpa = 0; + len = 0; + } + + err = H_PARAMETER; + vpap = NULL; + spin_lock(&tvcpu->arch.vpa_update_lock); + + switch (subfunc) { + case H_VPA_REG_VPA: /* register VPA */ + if (len < sizeof(struct lppaca)) break; - case 3: /* register SLB shadow buffer */ - if (len < 16) - goto out_unpin; - if (tvcpu->arch.slb_shadow) - kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow); - tvcpu->arch.slb_shadow = va; + vpap = &tvcpu->arch.vpa; + err = 0; + break; + + case H_VPA_REG_DTL: /* register DTL */ + if (len < sizeof(struct dtl_entry)) break; - } - } else { - switch (flags) { - case 5: /* unregister VPA */ - if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) - return H_RESOURCE; - if (!tvcpu->arch.vpa) - break; - kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa); - tvcpu->arch.vpa = NULL; + len -= len % sizeof(struct dtl_entry); + + /* Check that they have previously registered a VPA */ + err = H_RESOURCE; + if (!vpa_is_registered(&tvcpu->arch.vpa)) break; - case 6: /* unregister DTL */ - if (!tvcpu->arch.dtl) - break; - kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl); - tvcpu->arch.dtl = NULL; + + vpap = &tvcpu->arch.dtl; + err = 0; + break; + + case H_VPA_REG_SLB: /* register SLB shadow buffer */ + /* Check that they have previously registered a VPA */ + err = H_RESOURCE; + if (!vpa_is_registered(&tvcpu->arch.vpa)) break; - case 7: /* unregister SLB shadow buffer */ - if (!tvcpu->arch.slb_shadow) - break; - kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow); - tvcpu->arch.slb_shadow = NULL; + + vpap = &tvcpu->arch.slb_shadow; + err = 0; + break; + + case H_VPA_DEREG_VPA: /* deregister VPA */ + /* Check they don't still have a DTL or SLB buf registered */ + err = H_RESOURCE; + if (vpa_is_registered(&tvcpu->arch.dtl) || + vpa_is_registered(&tvcpu->arch.slb_shadow)) break; - } + + vpap = &tvcpu->arch.vpa; + err = 0; + break; + + case H_VPA_DEREG_DTL: /* deregister DTL */ + vpap = &tvcpu->arch.dtl; + err = 0; + break; + + case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */ + vpap = &tvcpu->arch.slb_shadow; + err = 0; + break; + } + + if (vpap) { + vpap->next_gpa = vpa; + vpap->len = len; + vpap->update_pending = 1; } - return H_SUCCESS; - out_unpin: - kvmppc_unpin_guest_page(kvm, va); + spin_unlock(&tvcpu->arch.vpa_update_lock); + return err; } +static void kvmppc_update_vpa(struct kvm *kvm, struct kvmppc_vpa *vpap) +{ + void *va; + unsigned long nb; + + vpap->update_pending = 0; + va = NULL; + if (vpap->next_gpa) { + va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb); + if (nb < vpap->len) { + /* + * If it's now too short, it must be that userspace + * has changed the mappings underlying guest memory, + * so unregister the region. + */ + kvmppc_unpin_guest_page(kvm, va); + va = NULL; + } + } + if (vpap->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpap->pinned_addr); + vpap->pinned_addr = va; + if (va) + vpap->pinned_end = va + vpap->len; +} + +static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + spin_lock(&vcpu->arch.vpa_update_lock); + if (vcpu->arch.vpa.update_pending) { + kvmppc_update_vpa(kvm, &vcpu->arch.vpa); + init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); + } + if (vcpu->arch.dtl.update_pending) { + kvmppc_update_vpa(kvm, &vcpu->arch.dtl); + vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; + vcpu->arch.dtl_index = 0; + } + if (vcpu->arch.slb_shadow.update_pending) + kvmppc_update_vpa(kvm, &vcpu->arch.slb_shadow); + spin_unlock(&vcpu->arch.vpa_update_lock); +} + +static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, + struct kvmppc_vcore *vc) +{ + struct dtl_entry *dt; + struct lppaca *vpa; + unsigned long old_stolen; + + dt = vcpu->arch.dtl_ptr; + vpa = vcpu->arch.vpa.pinned_addr; + old_stolen = vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = vc->stolen_tb; + if (!dt || !vpa) + return; + memset(dt, 0, sizeof(struct dtl_entry)); + dt->dispatch_reason = 7; + dt->processor_id = vc->pcpu + vcpu->arch.ptid; + dt->timebase = mftb(); + dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; + dt->srr0 = kvmppc_get_pc(vcpu); + dt->srr1 = vcpu->arch.shregs.msr; + ++dt; + if (dt == vcpu->arch.dtl.pinned_end) + dt = vcpu->arch.dtl.pinned_addr; + vcpu->arch.dtl_ptr = dt; + /* order writing *dt vs. writing vpa->dtl_idx */ + smp_wmb(); + vpa->dtl_idx = ++vcpu->arch.dtl_index; +} + int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) { unsigned long req = kvmppc_get_gpr(vcpu, 3); @@ -468,6 +586,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) /* default to host PVR, since we can't spoof it */ vcpu->arch.pvr = mfspr(SPRN_PVR); kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + spin_lock_init(&vcpu->arch.vpa_update_lock); kvmppc_mmu_book3s_hv_init(vcpu); @@ -486,6 +605,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); + vcore->preempt_tb = mftb(); } kvm->arch.vcores[core] = vcore; } @@ -498,6 +618,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) ++vcore->num_threads; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; + vcpu->arch.stolen_logged = vcore->stolen_tb; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -512,12 +633,14 @@ out: void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.dtl) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl); - if (vcpu->arch.slb_shadow) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow); - if (vcpu->arch.vpa) - kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa); + spin_lock(&vcpu->arch.vpa_update_lock); + if (vcpu->arch.dtl.pinned_addr) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr); + if (vcpu->arch.slb_shadow.pinned_addr) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr); + if (vcpu->arch.vpa.pinned_addr) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr); + spin_unlock(&vcpu->arch.vpa_update_lock); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } @@ -569,6 +692,45 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, list_del(&vcpu->arch.run_list); } +static int kvmppc_grab_hwthread(int cpu) +{ + struct paca_struct *tpaca; + long timeout = 1000; + + tpaca = &paca[cpu]; + + /* Ensure the thread won't go into the kernel if it wakes */ + tpaca->kvm_hstate.hwthread_req = 1; + + /* + * If the thread is already executing in the kernel (e.g. handling + * a stray interrupt), wait for it to get back to nap mode. + * The smp_mb() is to ensure that our setting of hwthread_req + * is visible before we look at hwthread_state, so if this + * races with the code at system_reset_pSeries and the thread + * misses our setting of hwthread_req, we are sure to see its + * setting of hwthread_state, and vice versa. + */ + smp_mb(); + while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { + if (--timeout <= 0) { + pr_err("KVM: couldn't grab cpu %d\n", cpu); + return -EBUSY; + } + udelay(1); + } + return 0; +} + +static void kvmppc_release_hwthread(int cpu) +{ + struct paca_struct *tpaca; + + tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; + tpaca->kvm_hstate.kvm_vcpu = NULL; +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu) { int cpu; @@ -588,8 +750,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (vcpu->arch.ptid) { - tpaca->cpu_start = 0x80; - wmb(); + kvmppc_grab_hwthread(cpu); xics_wake_cpu(cpu); ++vc->n_woken; } @@ -639,7 +800,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) struct kvm_vcpu *vcpu, *vcpu0, *vnext; long ret; u64 now; - int ptid; + int ptid, i; /* don't start if any threads have a signal pending */ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) @@ -681,17 +842,29 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) vc->nap_count = 0; vc->entry_exit_count = 0; vc->vcore_state = VCORE_RUNNING; + vc->stolen_tb += mftb() - vc->preempt_tb; vc->in_guest = 0; vc->pcpu = smp_processor_id(); vc->napping_threads = 0; - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); + if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + kvmppc_update_vpas(vcpu); + kvmppc_create_dtl_entry(vcpu, vc); + } + /* Grab any remaining hw threads so they can't go into the kernel */ + for (i = ptid; i < threads_per_core; ++i) + kvmppc_grab_hwthread(vc->pcpu + i); preempt_disable(); spin_unlock(&vc->lock); kvm_guest_enter(); __kvmppc_vcore_entry(NULL, vcpu0); + for (i = 0; i < threads_per_core; ++i) + kvmppc_release_hwthread(vc->pcpu + i); spin_lock(&vc->lock); /* disable sending of IPIs on virtual external irqs */ @@ -737,6 +910,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) spin_lock(&vc->lock); out: vc->vcore_state = VCORE_INACTIVE; + vc->preempt_tb = mftb(); list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { if (vcpu->arch.ret != RESUME_GUEST) { @@ -835,6 +1009,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) spin_lock(&vc->lock); continue; } + vc->runner = vcpu; n_ceded = 0; list_for_each_entry(v, &vc->runnable_threads, arch.run_list) n_ceded += v->arch.ceded; @@ -854,6 +1029,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) wake_up(&v->arch.cpu_run); } } + vc->runner = NULL; } if (signal_pending(current)) { @@ -917,115 +1093,6 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return r; } -static long kvmppc_stt_npages(unsigned long window_size) -{ - return ALIGN((window_size >> SPAPR_TCE_SHIFT) - * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; -} - -static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) -{ - struct kvm *kvm = stt->kvm; - int i; - - mutex_lock(&kvm->lock); - list_del(&stt->list); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) - __free_page(stt->pages[i]); - kfree(stt); - mutex_unlock(&kvm->lock); - - kvm_put_kvm(kvm); -} - -static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; - struct page *page; - - if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) - return VM_FAULT_SIGBUS; - - page = stt->pages[vmf->pgoff]; - get_page(page); - vmf->page = page; - return 0; -} - -static const struct vm_operations_struct kvm_spapr_tce_vm_ops = { - .fault = kvm_spapr_tce_fault, -}; - -static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_ops = &kvm_spapr_tce_vm_ops; - return 0; -} - -static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) -{ - struct kvmppc_spapr_tce_table *stt = filp->private_data; - - release_spapr_tce_table(stt); - return 0; -} - -static struct file_operations kvm_spapr_tce_fops = { - .mmap = kvm_spapr_tce_mmap, - .release = kvm_spapr_tce_release, -}; - -long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) -{ - struct kvmppc_spapr_tce_table *stt = NULL; - long npages; - int ret = -ENOMEM; - int i; - - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - - npages = kvmppc_stt_npages(args->window_size); - - stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *), - GFP_KERNEL); - if (!stt) - goto fail; - - stt->liobn = args->liobn; - stt->window_size = args->window_size; - stt->kvm = kvm; - - for (i = 0; i < npages; i++) { - stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!stt->pages[i]) - goto fail; - } - - kvm_get_kvm(kvm); - - mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); - - mutex_unlock(&kvm->lock); - - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR); - -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); - - kfree(stt); - } - return ret; -} /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ @@ -1108,6 +1175,38 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) return fd; } +static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, + int linux_psize) +{ + struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; + + if (!def->shift) + return; + (*sps)->page_shift = def->shift; + (*sps)->slb_enc = def->sllp; + (*sps)->enc[0].page_shift = def->shift; + (*sps)->enc[0].pte_enc = def->penc; + (*sps)++; +} + +int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +{ + struct kvm_ppc_one_seg_page_size *sps; + + info->flags = KVM_PPC_PAGE_SIZES_REAL; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + info->flags |= KVM_PPC_1T_SEGMENTS; + info->slb_size = mmu_slb_size; + + /* We only support these sizes for now, and no muti-size segments */ + sps = &info->sps[0]; + kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); + kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); + kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); + + return 0; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1404,12 +1503,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_FAIL; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { return EMULATE_FAIL; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { return EMULATE_FAIL; } diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index d3fb4df..84035a5 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -68,19 +68,24 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) rotldi r10,r10,16 mtmsrd r10,1 - /* Save host PMU registers and load guest PMU registers */ + /* Save host PMU registers */ /* R4 is live here (vcpu pointer) but not r3 or r5 */ li r3, 1 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ mfspr r7, SPRN_MMCR0 /* save MMCR0 */ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + mfspr r6, SPRN_MMCRA +BEGIN_FTR_SECTION + /* On P7, clear MMCRA in order to disable SDAR updates */ + li r5, 0 + mtspr SPRN_MMCRA, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) isync ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ lbz r5, LPPACA_PMCINUSE(r3) cmpwi r5, 0 beq 31f /* skip if not */ mfspr r5, SPRN_MMCR1 - mfspr r6, SPRN_MMCRA std r7, HSTATE_MMCR(r13) std r5, HSTATE_MMCR + 8(r13) std r6, HSTATE_MMCR + 16(r13) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index b70bf22..a84aafc 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -26,6 +26,7 @@ #include <asm/hvcall.h> #include <asm/asm-offsets.h> #include <asm/exception-64s.h> +#include <asm/kvm_book3s_asm.h> /***************************************************************************** * * @@ -82,6 +83,7 @@ _GLOBAL(kvmppc_hv_entry_trampoline) #define XICS_XIRR 4 #define XICS_QIRR 0xc +#define XICS_IPI 2 /* interrupt source # for IPIs */ /* * We come in here when wakened from nap mode on a secondary hw thread. @@ -94,26 +96,54 @@ kvm_start_guest: subi r1,r1,STACK_FRAME_OVERHEAD ld r2,PACATOC(r13) - /* were we napping due to cede? */ - lbz r0,HSTATE_NAPPING(r13) - cmpwi r0,0 - bne kvm_end_cede + li r0,KVM_HWTHREAD_IN_KVM + stb r0,HSTATE_HWTHREAD_STATE(r13) - /* get vcpu pointer */ - ld r4, HSTATE_KVM_VCPU(r13) + /* NV GPR values from power7_idle() will no longer be valid */ + li r0,1 + stb r0,PACA_NAPSTATELOST(r13) - /* We got here with an IPI; clear it */ - ld r5, HSTATE_XICS_PHYS(r13) - li r0, 0xff - li r6, XICS_QIRR - li r7, XICS_XIRR - lwzcix r8, r5, r7 /* ack the interrupt */ + /* get vcpu pointer, NULL if we have no vcpu to run */ + ld r4,HSTATE_KVM_VCPU(r13) + cmpdi cr1,r4,0 + + /* Check the wake reason in SRR1 to see why we got here */ + mfspr r3,SPRN_SRR1 + rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ + cmpwi r3,4 /* was it an external interrupt? */ + bne 27f + + /* + * External interrupt - for now assume it is an IPI, since we + * should never get any other interrupts sent to offline threads. + * Only do this for secondary threads. + */ + beq cr1,25f + lwz r3,VCPU_PTID(r4) + cmpwi r3,0 + beq 27f +25: ld r5,HSTATE_XICS_PHYS(r13) + li r0,0xff + li r6,XICS_QIRR + li r7,XICS_XIRR + lwzcix r8,r5,r7 /* get and ack the interrupt */ sync - stbcix r0, r5, r6 /* clear it */ - stwcix r8, r5, r7 /* EOI it */ + clrldi. r9,r8,40 /* get interrupt source ID. */ + beq 27f /* none there? */ + cmpwi r9,XICS_IPI + bne 26f + stbcix r0,r5,r6 /* clear IPI */ +26: stwcix r8,r5,r7 /* EOI the interrupt */ - /* NV GPR values from power7_idle() will no longer be valid */ - stb r0, PACA_NAPSTATELOST(r13) +27: /* XXX should handle hypervisor maintenance interrupts etc. here */ + + /* if we have no vcpu to run, go back to sleep */ + beq cr1,kvm_no_guest + + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,0 + bne kvm_end_cede .global kvmppc_hv_entry kvmppc_hv_entry: @@ -129,24 +159,15 @@ kvmppc_hv_entry: mflr r0 std r0, HSTATE_VMHANDLER(r13) - ld r14, VCPU_GPR(r14)(r4) - ld r15, VCPU_GPR(r15)(r4) - ld r16, VCPU_GPR(r16)(r4) - ld r17, VCPU_GPR(r17)(r4) - ld r18, VCPU_GPR(r18)(r4) - ld r19, VCPU_GPR(r19)(r4) - ld r20, VCPU_GPR(r20)(r4) - ld r21, VCPU_GPR(r21)(r4) - ld r22, VCPU_GPR(r22)(r4) - ld r23, VCPU_GPR(r23)(r4) - ld r24, VCPU_GPR(r24)(r4) - ld r25, VCPU_GPR(r25)(r4) - ld r26, VCPU_GPR(r26)(r4) - ld r27, VCPU_GPR(r27)(r4) - ld r28, VCPU_GPR(r28)(r4) - ld r29, VCPU_GPR(r29)(r4) - ld r30, VCPU_GPR(r30)(r4) - ld r31, VCPU_GPR(r31)(r4) + /* Set partition DABR */ + /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ + li r5,3 + ld r6,VCPU_DABR(r4) + mtspr SPRN_DABRX,r5 + mtspr SPRN_DABR,r6 +BEGIN_FTR_SECTION + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Load guest PMU registers */ /* R4 is live here (vcpu pointer) */ @@ -185,6 +206,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) /* Load up FP, VMX and VSX registers */ bl kvmppc_load_fp + ld r14, VCPU_GPR(r14)(r4) + ld r15, VCPU_GPR(r15)(r4) + ld r16, VCPU_GPR(r16)(r4) + ld r17, VCPU_GPR(r17)(r4) + ld r18, VCPU_GPR(r18)(r4) + ld r19, VCPU_GPR(r19)(r4) + ld r20, VCPU_GPR(r20)(r4) + ld r21, VCPU_GPR(r21)(r4) + ld r22, VCPU_GPR(r22)(r4) + ld r23, VCPU_GPR(r23)(r4) + ld r24, VCPU_GPR(r24)(r4) + ld r25, VCPU_GPR(r25)(r4) + ld r26, VCPU_GPR(r26)(r4) + ld r27, VCPU_GPR(r27)(r4) + ld r28, VCPU_GPR(r28)(r4) + ld r29, VCPU_GPR(r29)(r4) + ld r30, VCPU_GPR(r30)(r4) + ld r31, VCPU_GPR(r31)(r4) + BEGIN_FTR_SECTION /* Switch DSCR to guest value */ ld r5, VCPU_DSCR(r4) @@ -226,12 +266,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) mtspr SPRN_DAR, r5 mtspr SPRN_DSISR, r6 - /* Set partition DABR */ - li r5,3 - ld r6,VCPU_DABR(r4) - mtspr SPRN_DABRX,r5 - mtspr SPRN_DABR,r6 - BEGIN_FTR_SECTION /* Restore AMR and UAMOR, set AMOR to all 1s */ ld r5,VCPU_AMR(r4) @@ -925,12 +959,6 @@ BEGIN_FTR_SECTION mtspr SPRN_AMR,r6 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) - /* Restore host DABR and DABRX */ - ld r5,HSTATE_DABR(r13) - li r6,7 - mtspr SPRN_DABR,r5 - mtspr SPRN_DABRX,r6 - /* Switch DSCR back to host value */ BEGIN_FTR_SECTION mfspr r8, SPRN_DSCR @@ -969,6 +997,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) std r5, VCPU_SPRG2(r9) std r6, VCPU_SPRG3(r9) + /* save FP state */ + mr r3, r9 + bl .kvmppc_save_fp + /* Increment yield count if they have a VPA */ ld r8, VCPU_VPA(r9) /* do they have a VPA? */ cmpdi r8, 0 @@ -983,6 +1015,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ mfspr r4, SPRN_MMCR0 /* save MMCR0 */ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA +BEGIN_FTR_SECTION + /* On P7, clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) isync beq 21f /* if no VPA, save PMU stuff anyway */ lbz r7, LPPACA_PMCINUSE(r8) @@ -991,7 +1029,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ b 22f 21: mfspr r5, SPRN_MMCR1 - mfspr r6, SPRN_MMCRA std r4, VCPU_MMCR(r9) std r5, VCPU_MMCR + 8(r9) std r6, VCPU_MMCR + 16(r9) @@ -1016,17 +1053,20 @@ BEGIN_FTR_SECTION stw r11, VCPU_PMC + 28(r9) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 22: - /* save FP state */ - mr r3, r9 - bl .kvmppc_save_fp /* Secondary threads go off to take a nap on POWER7 */ BEGIN_FTR_SECTION - lwz r0,VCPU_PTID(r3) + lwz r0,VCPU_PTID(r9) cmpwi r0,0 bne secondary_nap END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 + /* * Reload DEC. HDEC interrupts were disabled when * we reloaded the host's LPCR value. @@ -1363,7 +1403,12 @@ bounce_ext_interrupt: _GLOBAL(kvmppc_h_set_dabr) std r4,VCPU_DABR(r3) - mtspr SPRN_DABR,r4 + /* Work around P7 bug where DABR can get corrupted on mtspr */ +1: mtspr SPRN_DABR,r4 + mfspr r5, SPRN_DABR + cmpd r4, r5 + bne 1b + isync li r3,0 blr @@ -1445,8 +1490,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) * Take a nap until a decrementer or external interrupt occurs, * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR */ - li r0,0x80 - stb r0,PACAPROCSTART(r13) + li r0,1 + stb r0,HSTATE_HWTHREAD_REQ(r13) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR,r5 @@ -1463,26 +1508,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) kvm_end_cede: /* Woken by external or decrementer interrupt */ ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - /* If we're a secondary thread and we got here by an IPI, ack it */ - ld r4,HSTATE_KVM_VCPU(r13) - lwz r3,VCPU_PTID(r4) - cmpwi r3,0 - beq 27f - mfspr r3,SPRN_SRR1 - rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ - cmpwi r3,4 /* was it an external interrupt? */ - bne 27f - ld r5, HSTATE_XICS_PHYS(r13) - li r0,0xff - li r6,XICS_QIRR - li r7,XICS_XIRR - lwzcix r8,r5,r7 /* ack the interrupt */ - sync - stbcix r0,r5,r6 /* clear it */ - stwcix r8,r5,r7 /* EOI it */ -27: /* load up FP state */ bl kvmppc_load_fp @@ -1580,12 +1606,17 @@ secondary_nap: stwcx. r3, 0, r4 bne 51b +kvm_no_guest: + li r0, KVM_HWTHREAD_IN_NAP + stb r0, HSTATE_HWTHREAD_STATE(r13) + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 isync - li r0, 0 std r0, HSTATE_SCRATCH0(r13) ptesync ld r0, HSTATE_SCRATCH0(r13) @@ -1599,8 +1630,8 @@ secondary_nap: * r3 = vcpu pointer */ _GLOBAL(kvmppc_save_fp) - mfmsr r9 - ori r8,r9,MSR_FP + mfmsr r5 + ori r8,r5,MSR_FP #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION oris r8,r8,MSR_VEC@h @@ -1649,7 +1680,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif mfspr r6,SPRN_VRSAVE stw r6,VCPU_VRSAVE(r3) - mtmsrd r9 + mtmsrd r5 isync blr diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 7759053..a1baec3 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -120,6 +120,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; /* Unset POW bit after we woke up */ @@ -144,6 +145,21 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) } } + /* + * When switching from 32 to 64-bit, we may have a stale 32-bit + * magic page around, we need to flush it. Typically 32-bit magic + * page will be instanciated when calling into RTAS. Note: We + * assume that such transition only happens while in kernel mode, + * ie, we never transition from user 32-bit to kernel 64-bit with + * a 32-bit magic page around. + */ + if (vcpu->arch.magic_page_pa && + !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) { + /* going from RTAS to normal kernel code */ + kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa, + ~0xFFFUL); + } + /* Preload FPU if it's enabled */ if (vcpu->arch.shared->msr & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); @@ -251,6 +267,9 @@ static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { ulong mp_pa = vcpu->arch.magic_page_pa; + if (!(vcpu->arch.shared->msr & MSR_SF)) + mp_pa = (uint32_t)mp_pa; + if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) { return 1; @@ -351,6 +370,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* MMIO */ vcpu->stat.mmio_exits++; vcpu->arch.paddr_accessed = pte.raddr; + vcpu->arch.vaddr_accessed = pte.eaddr; r = kvmppc_emulate_mmio(run, vcpu); if ( r == RESUME_HOST_NV ) r = RESUME_HOST; @@ -528,6 +548,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; + /* We get here with MSR.EE=0, so enable it to be a nice citizen */ + __hard_irq_enable(); + trace_kvm_book3s_exit(exit_nr, vcpu); preempt_enable(); kvm_resched(vcpu); @@ -617,10 +640,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* We're good on these - the host merely wanted to get our attention */ case BOOK3S_INTERRUPT_DECREMENTER: + case BOOK3S_INTERRUPT_HV_DECREMENTER: vcpu->stat.dec_exits++; r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: + case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: + case BOOK3S_INTERRUPT_EXTERNAL_HV: vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; @@ -628,6 +654,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_PROGRAM: + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: { enum emulation_result er; struct kvmppc_book3s_shadow_vcpu *svcpu; @@ -1131,6 +1158,31 @@ out: return r; } +#ifdef CONFIG_PPC64 +int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) +{ + /* No flags */ + info->flags = 0; + + /* SLB is always 64 entries */ + info->slb_size = 64; + + /* Standard 4k base page size segment */ + info->sps[0].page_shift = 12; + info->sps[0].slb_enc = 0; + info->sps[0].enc[0].page_shift = 12; + info->sps[0].enc[0].pte_enc = 0; + + /* Standard 16M large page size segment */ + info->sps[1].page_shift = 24; + info->sps[1].slb_enc = SLB_VSID_L; + info->sps[1].enc[0].page_shift = 24; + info->sps[1].enc[0].pte_enc = 0; + + return 0; +} +#endif /* CONFIG_PPC64 */ + int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { @@ -1144,11 +1196,18 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, int kvmppc_core_init_vm(struct kvm *kvm) { +#ifdef CONFIG_PPC64 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); +#endif + return 0; } void kvmppc_core_destroy_vm(struct kvm *kvm) { +#ifdef CONFIG_PPC64 + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif } static int kvmppc_book3s_init(void) diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index b958932..3ff9013 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -15,6 +15,8 @@ * published by the Free Software Foundation. */ +#include <linux/anon_inodes.h> + #include <asm/uaccess.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -98,6 +100,83 @@ static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +/* Request defs for kvmppc_h_pr_bulk_remove() */ +#define H_BULK_REMOVE_TYPE 0xc000000000000000ULL +#define H_BULK_REMOVE_REQUEST 0x4000000000000000ULL +#define H_BULK_REMOVE_RESPONSE 0x8000000000000000ULL +#define H_BULK_REMOVE_END 0xc000000000000000ULL +#define H_BULK_REMOVE_CODE 0x3000000000000000ULL +#define H_BULK_REMOVE_SUCCESS 0x0000000000000000ULL +#define H_BULK_REMOVE_NOT_FOUND 0x1000000000000000ULL +#define H_BULK_REMOVE_PARM 0x2000000000000000ULL +#define H_BULK_REMOVE_HW 0x3000000000000000ULL +#define H_BULK_REMOVE_RC 0x0c00000000000000ULL +#define H_BULK_REMOVE_FLAGS 0x0300000000000000ULL +#define H_BULK_REMOVE_ABSOLUTE 0x0000000000000000ULL +#define H_BULK_REMOVE_ANDCOND 0x0100000000000000ULL +#define H_BULK_REMOVE_AVPN 0x0200000000000000ULL +#define H_BULK_REMOVE_PTEX 0x00ffffffffffffffULL +#define H_BULK_REMOVE_MAX_BATCH 4 + +static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) +{ + int i; + int paramnr = 4; + int ret = H_SUCCESS; + + for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { + unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); + unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); + unsigned long pteg, rb, flags; + unsigned long pte[2]; + unsigned long v = 0; + + if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) { + break; /* Exit success */ + } else if ((tsh & H_BULK_REMOVE_TYPE) != + H_BULK_REMOVE_REQUEST) { + ret = H_PARAMETER; + break; /* Exit fail */ + } + + tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS; + tsh |= H_BULK_REMOVE_RESPONSE; + + if ((tsh & H_BULK_REMOVE_ANDCOND) && + (tsh & H_BULK_REMOVE_AVPN)) { + tsh |= H_BULK_REMOVE_PARM; + kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); + ret = H_PARAMETER; + break; /* Exit fail */ + } + + pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + /* tsl = AVPN */ + flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26; + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) || + ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) { + tsh |= H_BULK_REMOVE_NOT_FOUND; + } else { + /* Splat the pteg in (userland) hpt */ + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], + tsh & H_BULK_REMOVE_PTEX); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + tsh |= H_BULK_REMOVE_SUCCESS; + tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43; + } + kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); + } + kvmppc_set_gpr(vcpu, 3, ret); + + return EMULATE_DONE; +} + static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) { unsigned long flags = kvmppc_get_gpr(vcpu, 4); @@ -134,6 +213,20 @@ static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + long rc; + + rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) { switch (cmd) { @@ -144,12 +237,12 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_PROTECT: return kvmppc_h_pr_protect(vcpu); case H_BULK_REMOVE: - /* We just flush all PTEs, so user space can - handle the HPT modifications */ - kvmppc_mmu_pte_flush(vcpu, 0, 0); - break; + return kvmppc_h_pr_bulk_remove(vcpu); + case H_PUT_TCE: + return kvmppc_h_pr_put_tce(vcpu); case H_CEDE: kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; } diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 6e6e9ce..798491a 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -128,24 +128,25 @@ no_dcbz32_on: /* First clear RI in our current MSR value */ li r0, MSR_RI andc r6, r6, r0 - MTMSR_EERI(r6) - mtsrr0 r9 - mtsrr1 r4 PPC_LL r0, SVCPU_R0(r3) PPC_LL r1, SVCPU_R1(r3) PPC_LL r2, SVCPU_R2(r3) - PPC_LL r4, SVCPU_R4(r3) PPC_LL r5, SVCPU_R5(r3) - PPC_LL r6, SVCPU_R6(r3) PPC_LL r7, SVCPU_R7(r3) PPC_LL r8, SVCPU_R8(r3) - PPC_LL r9, SVCPU_R9(r3) PPC_LL r10, SVCPU_R10(r3) PPC_LL r11, SVCPU_R11(r3) PPC_LL r12, SVCPU_R12(r3) PPC_LL r13, SVCPU_R13(r3) + MTMSR_EERI(r6) + mtsrr0 r9 + mtsrr1 r4 + + PPC_LL r4, SVCPU_R4(r3) + PPC_LL r6, SVCPU_R6(r3) + PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) RFI diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index ee9e1ee..72f13f4 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -17,6 +17,8 @@ * * Authors: Hollis Blanchard <hollisb@us.ibm.com> * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + * Scott Wood <scottwood@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> */ #include <linux/errno.h> @@ -30,9 +32,12 @@ #include <asm/cputable.h> #include <asm/uaccess.h> #include <asm/kvm_ppc.h> -#include "timing.h" #include <asm/cacheflush.h> +#include <asm/dbell.h> +#include <asm/hw_irq.h> +#include <asm/irq.h> +#include "timing.h" #include "booke.h" unsigned long kvmppc_booke_handlers; @@ -55,6 +60,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "doorbell", VCPU_STAT(dbell_exits) }, + { "guest doorbell", VCPU_STAT(gdbell_exits) }, { NULL } }; @@ -121,6 +128,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) { u32 old_msr = vcpu->arch.shared->msr; +#ifdef CONFIG_KVM_BOOKE_HV + new_msr |= MSR_GS; +#endif + vcpu->arch.shared->msr = new_msr; kvmppc_mmu_msr_notify(vcpu, old_msr); @@ -195,17 +206,87 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } +static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GSRR0, srr0); + mtspr(SPRN_GSRR1, srr1); +#else + vcpu->arch.shared->srr0 = srr0; + vcpu->arch.shared->srr1 = srr1; +#endif +} + +static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.csrr0 = srr0; + vcpu->arch.csrr1 = srr1; +} + +static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) { + vcpu->arch.dsrr0 = srr0; + vcpu->arch.dsrr1 = srr1; + } else { + set_guest_csrr(vcpu, srr0, srr1); + } +} + +static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.mcsrr0 = srr0; + vcpu->arch.mcsrr1 = srr1; +} + +static unsigned long get_guest_dear(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GDEAR); +#else + return vcpu->arch.shared->dar; +#endif +} + +static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GDEAR, dear); +#else + vcpu->arch.shared->dar = dear; +#endif +} + +static unsigned long get_guest_esr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_KVM_BOOKE_HV + return mfspr(SPRN_GESR); +#else + return vcpu->arch.shared->esr; +#endif +} + +static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr) +{ +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GESR, esr); +#else + vcpu->arch.shared->esr = esr; +#endif +} + /* Deliver the interrupt of the corresponding priority, if possible. */ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { int allowed = 0; - ulong uninitialized_var(msr_mask); + ulong msr_mask = 0; bool update_esr = false, update_dear = false; ulong crit_raw = vcpu->arch.shared->critical; ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); bool crit; bool keep_irq = false; + enum int_class int_class; /* Truncate crit indicators in 32 bit mode */ if (!(vcpu->arch.shared->msr & MSR_SF)) { @@ -241,46 +322,85 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_AP_UNAVAIL: case BOOKE_IRQPRIO_ALIGNMENT: allowed = 1; - msr_mask = MSR_CE|MSR_ME|MSR_DE; + msr_mask = MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; break; case BOOKE_IRQPRIO_CRITICAL: - case BOOKE_IRQPRIO_WATCHDOG: + case BOOKE_IRQPRIO_DBELL_CRIT: allowed = vcpu->arch.shared->msr & MSR_CE; + allowed = allowed && !crit; msr_mask = MSR_ME; + int_class = INT_CLASS_CRIT; break; case BOOKE_IRQPRIO_MACHINE_CHECK: allowed = vcpu->arch.shared->msr & MSR_ME; - msr_mask = 0; + allowed = allowed && !crit; + int_class = INT_CLASS_MC; break; case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: keep_irq = true; /* fall through */ case BOOKE_IRQPRIO_EXTERNAL: + case BOOKE_IRQPRIO_DBELL: allowed = vcpu->arch.shared->msr & MSR_EE; allowed = allowed && !crit; - msr_mask = MSR_CE|MSR_ME|MSR_DE; + msr_mask = MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; break; case BOOKE_IRQPRIO_DEBUG: allowed = vcpu->arch.shared->msr & MSR_DE; + allowed = allowed && !crit; msr_mask = MSR_ME; + int_class = INT_CLASS_CRIT; break; } if (allowed) { - vcpu->arch.shared->srr0 = vcpu->arch.pc; - vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; + switch (int_class) { + case INT_CLASS_NONCRIT: + set_guest_srr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_CRIT: + set_guest_csrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_DBG: + set_guest_dsrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_MC: + set_guest_mcsrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + } + vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) - vcpu->arch.shared->esr = vcpu->arch.queued_esr; + set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) - vcpu->arch.shared->dar = vcpu->arch.queued_dear; + set_guest_dear(vcpu, vcpu->arch.queued_dear); kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); if (!keep_irq) clear_bit(priority, &vcpu->arch.pending_exceptions); } +#ifdef CONFIG_KVM_BOOKE_HV + /* + * If an interrupt is pending but masked, raise a guest doorbell + * so that we are notified when the guest enables the relevant + * MSR bit. + */ + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC); +#endif + return allowed; } @@ -305,7 +425,7 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) } priority = __ffs(*pending); - while (priority <= BOOKE_IRQPRIO_MAX) { + while (priority < BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) break; @@ -319,8 +439,9 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) } /* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { + int r = 0; WARN_ON_ONCE(!irqs_disabled()); kvmppc_core_check_exceptions(vcpu); @@ -328,16 +449,60 @@ void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); local_irq_disable(); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); - kvmppc_core_check_exceptions(vcpu); + r = 1; }; + + return r; +} + +/* + * Common checks before entering the guest world. Call with interrupts + * disabled. + * + * returns !0 if a signal is pending and check_signal is true + */ +static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r = 0; + + WARN_ON_ONCE(!irqs_disabled()); + while (true) { + if (need_resched()) { + local_irq_enable(); + cond_resched(); + local_irq_disable(); + continue; + } + + if (signal_pending(current)) { + r = 1; + break; + } + + if (kvmppc_core_prepare_to_enter(vcpu)) { + /* interrupts got enabled in between, so we + are back at square 1 */ + continue; + } + + break; + } + + return r; } int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; +#ifdef CONFIG_PPC_FPU + unsigned int fpscr; + int fpexc_mode; + u64 fpr[32]; +#endif if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -345,17 +510,53 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } local_irq_disable(); - - kvmppc_core_prepare_to_enter(vcpu); - - if (signal_pending(current)) { + if (kvmppc_prepare_to_enter(vcpu)) { kvm_run->exit_reason = KVM_EXIT_INTR; ret = -EINTR; goto out; } kvm_guest_enter(); + +#ifdef CONFIG_PPC_FPU + /* Save userspace FPU state in stack */ + enable_kernel_fp(); + memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); + fpscr = current->thread.fpscr.val; + fpexc_mode = current->thread.fpexc_mode; + + /* Restore guest FPU state to thread */ + memcpy(current->thread.fpr, vcpu->arch.fpr, sizeof(vcpu->arch.fpr)); + current->thread.fpscr.val = vcpu->arch.fpscr; + + /* + * Since we can't trap on MSR_FP in GS-mode, we consider the guest + * as always using the FPU. Kernel usage of FP (via + * enable_kernel_fp()) in this thread must not occur while + * vcpu->fpu_active is set. + */ + vcpu->fpu_active = 1; + + kvmppc_load_guest_fp(vcpu); +#endif + ret = __kvmppc_vcpu_run(kvm_run, vcpu); + +#ifdef CONFIG_PPC_FPU + kvmppc_save_guest_fp(vcpu); + + vcpu->fpu_active = 0; + + /* Save guest FPU state from thread */ + memcpy(vcpu->arch.fpr, current->thread.fpr, sizeof(vcpu->arch.fpr)); + vcpu->arch.fpscr = current->thread.fpscr.val; + + /* Restore userspace FPU state from stack */ + memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); + current->thread.fpscr.val = fpscr; + current->thread.fpexc_mode = fpexc_mode; +#endif + kvm_guest_exit(); out: @@ -363,6 +564,84 @@ out: return ret; } +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + /* don't overwrite subtypes, just account kvm_stats */ + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); + /* Future optimization: only reload non-volatiles if + * they were actually modified by emulation. */ + return RESUME_GUEST_NV; + + case EMULATE_DO_DCR: + run->exit_reason = KVM_EXIT_DCR; + return RESUME_HOST; + + case EMULATE_FAIL: + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, vcpu->arch.pc, vcpu->arch.last_inst); + /* For debugging, encode the failing instruction and + * report it to userspace. */ + run->hw.hardware_exit_reason = ~0ULL << 32; + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + default: + BUG(); + } +} + +static void kvmppc_fill_pt_regs(struct pt_regs *regs) +{ + ulong r1, ip, msr, lr; + + asm("mr %0, 1" : "=r"(r1)); + asm("mflr %0" : "=r"(lr)); + asm("mfmsr %0" : "=r"(msr)); + asm("bl 1f; 1: mflr %0" : "=r"(ip)); + + memset(regs, 0, sizeof(*regs)); + regs->gpr[1] = r1; + regs->nip = ip; + regs->msr = msr; + regs->link = lr; +} + +static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + struct pt_regs regs; + + switch (exit_nr) { + case BOOKE_INTERRUPT_EXTERNAL: + kvmppc_fill_pt_regs(®s); + do_IRQ(®s); + break; + case BOOKE_INTERRUPT_DECREMENTER: + kvmppc_fill_pt_regs(®s); + timer_interrupt(®s); + break; +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_fill_pt_regs(®s); + doorbell_exception(®s); + break; +#endif + case BOOKE_INTERRUPT_MACHINE_CHECK: + /* FIXME */ + break; + case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: + kvmppc_fill_pt_regs(®s); + performance_monitor_exception(®s); + break; + } +} + /** * kvmppc_handle_exit * @@ -371,12 +650,14 @@ out: int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { - enum emulation_result er; int r = RESUME_HOST; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); + /* restart interrupts if they were meant for the host */ + kvmppc_restart_interrupt(vcpu, exit_nr); + local_irq_enable(); run->exit_reason = KVM_EXIT_UNKNOWN; @@ -386,62 +667,74 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOKE_INTERRUPT_MACHINE_CHECK: printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); kvmppc_dump_vcpu(vcpu); + /* For debugging, send invalid exit reason to user space */ + run->hw.hardware_exit_reason = ~1ULL << 32; + run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR); r = RESUME_HOST; break; case BOOKE_INTERRUPT_EXTERNAL: kvmppc_account_exit(vcpu, EXT_INTR_EXITS); - if (need_resched()) - cond_resched(); r = RESUME_GUEST; break; case BOOKE_INTERRUPT_DECREMENTER: - /* Since we switched IVPR back to the host's value, the host - * handled this interrupt the moment we enabled interrupts. - * Now we just offer it a chance to reschedule the guest. */ kvmppc_account_exit(vcpu, DEC_EXITS); - if (need_resched()) - cond_resched(); r = RESUME_GUEST; break; + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_account_exit(vcpu, DBELL_EXITS); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL_CRIT: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_CE or MSR_ME was not + * set. Once we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_EE was not set. Once + * we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_HV_PRIV: + r = emulation_exit(run, vcpu); + break; + case BOOKE_INTERRUPT_PROGRAM: - if (vcpu->arch.shared->msr & MSR_PR) { - /* Program traps generated by user-level software must be handled - * by the guest kernel. */ + if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { + /* + * Program traps generated by user-level software must + * be handled by the guest kernel. + * + * In GS mode, hypervisor privileged instructions trap + * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are + * actual program interrupts, handled by the guest. + */ kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); r = RESUME_GUEST; kvmppc_account_exit(vcpu, USR_PR_INST); break; } - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - /* don't overwrite subtypes, just account kvm_stats */ - kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); - /* Future optimization: only reload non-volatiles if - * they were actually modified by emulation. */ - r = RESUME_GUEST_NV; - break; - case EMULATE_DO_DCR: - run->exit_reason = KVM_EXIT_DCR; - r = RESUME_HOST; - break; - case EMULATE_FAIL: - /* XXX Deliver Program interrupt to guest. */ - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, vcpu->arch.pc, vcpu->arch.last_inst); - /* For debugging, encode the failing instruction and - * report it to userspace. */ - run->hw.hardware_exit_reason = ~0ULL << 32; - run->hw.hardware_exit_reason |= vcpu->arch.last_inst; - r = RESUME_HOST; - break; - default: - BUG(); - } + r = emulation_exit(run, vcpu); break; case BOOKE_INTERRUPT_FP_UNAVAIL: @@ -506,6 +799,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; +#ifdef CONFIG_KVM_BOOKE_HV + case BOOKE_INTERRUPT_HV_SYSCALL: + if (!(vcpu->arch.shared->msr & MSR_PR)) { + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + } else { + /* + * hcall from guest userspace -- send privileged + * instruction program check. + */ + kvmppc_core_queue_program(vcpu, ESR_PPR); + } + + r = RESUME_GUEST; + break; +#else case BOOKE_INTERRUPT_SYSCALL: if (!(vcpu->arch.shared->msr & MSR_PR) && (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { @@ -519,6 +827,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_account_exit(vcpu, SYSCALL_EXITS); r = RESUME_GUEST; break; +#endif case BOOKE_INTERRUPT_DTLB_MISS: { unsigned long eaddr = vcpu->arch.fault_dear; @@ -526,7 +835,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, gpa_t gpaddr; gfn_t gfn; -#ifdef CONFIG_KVM_E500 +#ifdef CONFIG_KVM_E500V2 if (!(vcpu->arch.shared->msr & MSR_PR) && (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { kvmppc_map_magic(vcpu); @@ -567,6 +876,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Guest has mapped and accessed a page which is not * actually RAM. */ vcpu->arch.paddr_accessed = gpaddr; + vcpu->arch.vaddr_accessed = eaddr; r = kvmppc_emulate_mmio(run, vcpu); kvmppc_account_exit(vcpu, MMIO_EXITS); } @@ -634,15 +944,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, BUG(); } - local_irq_disable(); - - kvmppc_core_prepare_to_enter(vcpu); - + /* + * To avoid clobbering exit_reason, only check for signals if we + * aren't already exiting to userspace for some other reason. + */ if (!(r & RESUME_HOST)) { - /* To avoid clobbering exit_reason, only check for signals if - * we aren't already exiting to userspace for some other - * reason. */ - if (signal_pending(current)) { + local_irq_disable(); + if (kvmppc_prepare_to_enter(vcpu)) { run->exit_reason = KVM_EXIT_INTR; r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); kvmppc_account_exit(vcpu, SIGNAL_EXITS); @@ -659,12 +967,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.pc = 0; - vcpu->arch.shared->msr = 0; - vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; vcpu->arch.shared->pir = vcpu->vcpu_id; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif /* Eye-catching numbers so we know if the guest takes an interrupt * before it's programmed its own IVPR/IVORs. */ @@ -745,8 +1056,8 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, sregs->u.e.csrr0 = vcpu->arch.csrr0; sregs->u.e.csrr1 = vcpu->arch.csrr1; sregs->u.e.mcsr = vcpu->arch.mcsr; - sregs->u.e.esr = vcpu->arch.shared->esr; - sregs->u.e.dear = vcpu->arch.shared->dar; + sregs->u.e.esr = get_guest_esr(vcpu); + sregs->u.e.dear = get_guest_dear(vcpu); sregs->u.e.tsr = vcpu->arch.tsr; sregs->u.e.tcr = vcpu->arch.tcr; sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); @@ -763,8 +1074,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.csrr0 = sregs->u.e.csrr0; vcpu->arch.csrr1 = sregs->u.e.csrr1; vcpu->arch.mcsr = sregs->u.e.mcsr; - vcpu->arch.shared->esr = sregs->u.e.esr; - vcpu->arch.shared->dar = sregs->u.e.dear; + set_guest_esr(vcpu, sregs->u.e.esr); + set_guest_dear(vcpu, sregs->u.e.dear); vcpu->arch.vrsave = sregs->u.e.vrsave; kvmppc_set_tcr(vcpu, sregs->u.e.tcr); @@ -932,15 +1243,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, { } -int kvmppc_core_init_vm(struct kvm *kvm) -{ - return 0; -} - -void kvmppc_core_destroy_vm(struct kvm *kvm) -{ -} - void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) { vcpu->arch.tcr = new_tcr; @@ -968,8 +1270,19 @@ void kvmppc_decrementer_func(unsigned long data) kvmppc_set_tsr_bits(vcpu, TSR_DIS); } +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + current->thread.kvm_vcpu = vcpu; +} + +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) +{ + current->thread.kvm_vcpu = NULL; +} + int __init kvmppc_booke_init(void) { +#ifndef CONFIG_KVM_BOOKE_HV unsigned long ivor[16]; unsigned long max_ivor = 0; int i; @@ -1012,7 +1325,7 @@ int __init kvmppc_booke_init(void) } flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); - +#endif /* !BOOKE_HV */ return 0; } diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 2fe2027..ba61974 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -23,6 +23,7 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <asm/kvm_ppc.h> +#include <asm/switch_to.h> #include "timing.h" /* interrupt priortity ordering */ @@ -48,7 +49,20 @@ #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 /* Internal pseudo-irqprio for level triggered externals */ #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 -#define BOOKE_IRQPRIO_MAX 20 +#define BOOKE_IRQPRIO_DBELL 21 +#define BOOKE_IRQPRIO_DBELL_CRIT 22 +#define BOOKE_IRQPRIO_MAX 23 + +#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \ + (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \ + (1 << BOOKE_IRQPRIO_DBELL) | \ + (1 << BOOKE_IRQPRIO_DECREMENTER) | \ + (1 << BOOKE_IRQPRIO_FIT) | \ + (1 << BOOKE_IRQPRIO_EXTERNAL)) + +#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \ + (1 << BOOKE_IRQPRIO_WATCHDOG) | \ + (1 << BOOKE_IRQPRIO_CRITICAL)) extern unsigned long kvmppc_booke_handlers; @@ -61,8 +75,8 @@ void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); -int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); -int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); /* low-level asm code to transfer guest state */ void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu); @@ -71,4 +85,46 @@ void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu); /* high-level function, manages flags, host state */ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu); + +enum int_class { + INT_CLASS_NONCRIT, + INT_CLASS_CRIT, + INT_CLASS_MC, + INT_CLASS_DBG, +}; + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); + +/* + * Load up guest vcpu FP state if it's needed. + * It also set the MSR_FP in thread so that host know + * we're holding FPU, and then host can help to save + * guest vcpu FP state if other threads require to use FPU. + * This simulates an FP unavailable fault. + * + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) { + load_up_fpu(); + current->thread.regs->msr |= MSR_FP; + } +#endif +} + +/* + * Save guest vcpu FP state into thread. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (vcpu->fpu_active && (current->thread.regs->msr & MSR_FP)) + giveup_fpu(current); +#endif +} #endif /* __KVM_BOOKE_H__ */ diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 3e652da..6c76397 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -40,8 +40,8 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { int emulated = EMULATE_DONE; - int rs; - int rt; + int rs = get_rs(inst); + int rt = get_rt(inst); switch (get_op(inst)) { case 19: @@ -62,19 +62,16 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (get_xop(inst)) { case OP_31_XOP_MFMSR: - rt = get_rt(inst); kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); break; case OP_31_XOP_MTMSR: - rs = get_rs(inst); kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); break; case OP_31_XOP_WRTEE: - rs = get_rs(inst); vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE) | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); @@ -99,22 +96,32 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +/* + * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode). + * Their backing store is in real registers, and these functions + * will return the wrong result if called for them in another context + * (such as debugging). + */ +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { int emulated = EMULATE_DONE; - ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_DEAR: - vcpu->arch.shared->dar = spr_val; break; + vcpu->arch.shared->dar = spr_val; + break; case SPRN_ESR: - vcpu->arch.shared->esr = spr_val; break; + vcpu->arch.shared->esr = spr_val; + break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = spr_val; break; + vcpu->arch.dbcr0 = spr_val; + break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = spr_val; break; + vcpu->arch.dbcr1 = spr_val; + break; case SPRN_DBSR: - vcpu->arch.dbsr &= ~spr_val; break; + vcpu->arch.dbsr &= ~spr_val; + break; case SPRN_TSR: kvmppc_clr_tsr_bits(vcpu, spr_val); break; @@ -122,20 +129,29 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) kvmppc_set_tcr(vcpu, spr_val); break; - /* Note: SPRG4-7 are user-readable. These values are - * loaded into the real SPRGs when resuming the - * guest. */ + /* + * Note: SPRG4-7 are user-readable. + * These values are loaded into the real SPRGs when resuming the + * guest (PR-mode only). + */ case SPRN_SPRG4: - vcpu->arch.shared->sprg4 = spr_val; break; + vcpu->arch.shared->sprg4 = spr_val; + break; case SPRN_SPRG5: - vcpu->arch.shared->sprg5 = spr_val; break; + vcpu->arch.shared->sprg5 = spr_val; + break; case SPRN_SPRG6: - vcpu->arch.shared->sprg6 = spr_val; break; + vcpu->arch.shared->sprg6 = spr_val; + break; case SPRN_SPRG7: - vcpu->arch.shared->sprg7 = spr_val; break; + vcpu->arch.shared->sprg7 = spr_val; + break; case SPRN_IVPR: vcpu->arch.ivpr = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVPR, spr_val); +#endif break; case SPRN_IVOR0: vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; @@ -145,6 +161,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) break; case SPRN_IVOR2: vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR2, spr_val); +#endif break; case SPRN_IVOR3: vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; @@ -163,6 +182,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) break; case SPRN_IVOR8: vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR8, spr_val); +#endif break; case SPRN_IVOR9: vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; @@ -193,75 +215,83 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) return emulated; } -int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { int emulated = EMULATE_DONE; switch (sprn) { case SPRN_IVPR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; + *spr_val = vcpu->arch.ivpr; + break; case SPRN_DEAR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; + *spr_val = vcpu->arch.shared->dar; + break; case SPRN_ESR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break; + *spr_val = vcpu->arch.shared->esr; + break; case SPRN_DBCR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; + *spr_val = vcpu->arch.dbcr0; + break; case SPRN_DBCR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; + *spr_val = vcpu->arch.dbcr1; + break; case SPRN_DBSR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; + *spr_val = vcpu->arch.dbsr; + break; case SPRN_TSR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break; + *spr_val = vcpu->arch.tsr; + break; case SPRN_TCR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break; + *spr_val = vcpu->arch.tcr; + break; case SPRN_IVOR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; break; case SPRN_IVOR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; break; case SPRN_IVOR2: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; break; case SPRN_IVOR3: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; break; case SPRN_IVOR4: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; break; case SPRN_IVOR5: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; break; case SPRN_IVOR6: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; break; case SPRN_IVOR7: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; break; case SPRN_IVOR8: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; break; case SPRN_IVOR9: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; break; case SPRN_IVOR10: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; break; case SPRN_IVOR11: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; break; case SPRN_IVOR12: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; break; case SPRN_IVOR13: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; break; case SPRN_IVOR14: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; break; case SPRN_IVOR15: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; break; default: diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index c8c4b87..8feec2f 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -419,13 +419,13 @@ lightweight_exit: * written directly to the shared area, so we * need to reload them here with the guest's values. */ - lwz r3, VCPU_SHARED_SPRG4(r5) + PPC_LD(r3, VCPU_SHARED_SPRG4, r5) mtspr SPRN_SPRG4W, r3 - lwz r3, VCPU_SHARED_SPRG5(r5) + PPC_LD(r3, VCPU_SHARED_SPRG5, r5) mtspr SPRN_SPRG5W, r3 - lwz r3, VCPU_SHARED_SPRG6(r5) + PPC_LD(r3, VCPU_SHARED_SPRG6, r5) mtspr SPRN_SPRG6W, r3 - lwz r3, VCPU_SHARED_SPRG7(r5) + PPC_LD(r3, VCPU_SHARED_SPRG7, r5) mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S new file mode 100644 index 0000000..6048a00 --- /dev/null +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -0,0 +1,597 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2010-2011 Freescale Semiconductor, Inc. + * + * Author: Varun Sethi <varun.sethi@freescale.com> + * Author: Scott Wood <scotwood@freescale.com> + * + * This file is derived from arch/powerpc/kvm/booke_interrupts.S + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu-44x.h> +#include <asm/page.h> +#include <asm/asm-compat.h> +#include <asm/asm-offsets.h> +#include <asm/bitsperlong.h> +#include <asm/thread_info.h> + +#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ + +#define GET_VCPU(vcpu, thread) \ + PPC_LL vcpu, THREAD_KVM_VCPU(thread) + +#define LONGBYTES (BITS_PER_LONG / 8) + +#define VCPU_GPR(n) (VCPU_GPRS + (n * LONGBYTES)) +#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) + +/* The host stack layout: */ +#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ +#define HOST_CALLEE_LR (1 * LONGBYTES) +#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ +/* + * r2 is special: it holds 'current', and it made nonvolatile in the + * kernel with the -ffixed-r2 gcc option. + */ +#define HOST_R2 (3 * LONGBYTES) +#define HOST_CR (4 * LONGBYTES) +#define HOST_NV_GPRS (5 * LONGBYTES) +#define HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + LONGBYTES) +#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ +#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */ + +#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ +#define NEED_DEAR 0x00000002 /* save faulting DEAR */ +#define NEED_ESR 0x00000004 /* save faulting ESR */ + +/* + * On entry: + * r4 = vcpu, r5 = srr0, r6 = srr1 + * saved in vcpu: cr, ctr, r3-r13 + */ +.macro kvm_handler_common intno, srr0, flags + /* Restore host stack pointer */ + PPC_STL r1, VCPU_GPR(r1)(r4) + PPC_STL r2, VCPU_GPR(r2)(r4) + PPC_LL r1, VCPU_HOST_STACK(r4) + PPC_LL r2, HOST_R2(r1) + + mfspr r10, SPRN_PID + lwz r8, VCPU_HOST_PID(r4) + PPC_LL r11, VCPU_SHARED(r4) + PPC_STL r14, VCPU_GPR(r14)(r4) /* We need a non-volatile GPR. */ + li r14, \intno + + stw r10, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r8 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save exit time */ +1: mfspr r7, SPRN_TBRU + mfspr r8, SPRN_TBRL + mfspr r9, SPRN_TBRU + cmpw r9, r7 + stw r8, VCPU_TIMING_EXIT_TBL(r4) + bne- 1b + stw r9, VCPU_TIMING_EXIT_TBU(r4) +#endif + + oris r8, r6, MSR_CE@h + PPC_STD(r6, VCPU_SHARED_MSR, r11) + ori r8, r8, MSR_ME | MSR_RI + PPC_STL r5, VCPU_PC(r4) + + /* + * Make sure CE/ME/RI are set (if appropriate for exception type) + * whether or not the guest had it set. Since mfmsr/mtmsr are + * somewhat expensive, skip in the common case where the guest + * had all these bits set (and thus they're still set if + * appropriate for the exception type). + */ + cmpw r6, r8 + beq 1f + mfmsr r7 + .if \srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0 + oris r7, r7, MSR_CE@h + .endif + .if \srr0 != SPRN_MCSRR0 + ori r7, r7, MSR_ME | MSR_RI + .endif + mtmsr r7 +1: + + .if \flags & NEED_EMU + /* + * This assumes you have external PID support. + * To support a bookehv CPU without external PID, you'll + * need to look up the TLB entry and create a temporary mapping. + * + * FIXME: we don't currently handle if the lwepx faults. PR-mode + * booke doesn't handle it either. Since Linux doesn't use + * broadcast tlbivax anymore, the only way this should happen is + * if the guest maps its memory execute-but-not-read, or if we + * somehow take a TLB miss in the middle of this entry code and + * evict the relevant entry. On e500mc, all kernel lowmem is + * bolted into TLB1 large page mappings, and we don't use + * broadcast invalidates, so we should not take a TLB miss here. + * + * Later we'll need to deal with faults here. Disallowing guest + * mappings that are execute-but-not-read could be an option on + * e500mc, but not on chips with an LRAT if it is used. + */ + + mfspr r3, SPRN_EPLC /* will already have correct ELPID and EGS */ + PPC_STL r15, VCPU_GPR(r15)(r4) + PPC_STL r16, VCPU_GPR(r16)(r4) + PPC_STL r17, VCPU_GPR(r17)(r4) + PPC_STL r18, VCPU_GPR(r18)(r4) + PPC_STL r19, VCPU_GPR(r19)(r4) + mr r8, r3 + PPC_STL r20, VCPU_GPR(r20)(r4) + rlwimi r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS + PPC_STL r21, VCPU_GPR(r21)(r4) + rlwimi r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR + PPC_STL r22, VCPU_GPR(r22)(r4) + rlwimi r8, r10, EPC_EPID_SHIFT, EPC_EPID + PPC_STL r23, VCPU_GPR(r23)(r4) + PPC_STL r24, VCPU_GPR(r24)(r4) + PPC_STL r25, VCPU_GPR(r25)(r4) + PPC_STL r26, VCPU_GPR(r26)(r4) + PPC_STL r27, VCPU_GPR(r27)(r4) + PPC_STL r28, VCPU_GPR(r28)(r4) + PPC_STL r29, VCPU_GPR(r29)(r4) + PPC_STL r30, VCPU_GPR(r30)(r4) + PPC_STL r31, VCPU_GPR(r31)(r4) + mtspr SPRN_EPLC, r8 + + /* disable preemption, so we are sure we hit the fixup handler */ +#ifdef CONFIG_PPC64 + clrrdi r8,r1,THREAD_SHIFT +#else + rlwinm r8,r1,0,0,31-THREAD_SHIFT /* current thread_info */ +#endif + li r7, 1 + stw r7, TI_PREEMPT(r8) + + isync + + /* + * In case the read goes wrong, we catch it and write an invalid value + * in LAST_INST instead. + */ +1: lwepx r9, 0, r5 +2: +.section .fixup, "ax" +3: li r9, KVM_INST_FETCH_FAILED + b 2b +.previous +.section __ex_table,"a" + PPC_LONG_ALIGN + PPC_LONG 1b,3b +.previous + + mtspr SPRN_EPLC, r3 + li r7, 0 + stw r7, TI_PREEMPT(r8) + stw r9, VCPU_LAST_INST(r4) + .endif + + .if \flags & NEED_ESR + mfspr r8, SPRN_ESR + PPC_STL r8, VCPU_FAULT_ESR(r4) + .endif + + .if \flags & NEED_DEAR + mfspr r9, SPRN_DEAR + PPC_STL r9, VCPU_FAULT_DEAR(r4) + .endif + + b kvmppc_resume_host +.endm + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + GET_VCPU(r11, r10) + PPC_STL r3, VCPU_GPR(r3)(r11) + mfspr r3, SPRN_SPRG_RSCRATCH0 + PPC_STL r4, VCPU_GPR(r4)(r11) + PPC_LL r4, THREAD_NORMSAVE(0)(r10) + PPC_STL r5, VCPU_GPR(r5)(r11) + stw r13, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(r10)(r11) + PPC_LL r3, THREAD_NORMSAVE(2)(r10) + PPC_STL r6, VCPU_GPR(r6)(r11) + PPC_STL r4, VCPU_GPR(r11)(r11) + mfspr r6, \srr1 + PPC_STL r7, VCPU_GPR(r7)(r11) + PPC_STL r8, VCPU_GPR(r8)(r11) + PPC_STL r9, VCPU_GPR(r9)(r11) + PPC_STL r3, VCPU_GPR(r13)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(r12)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +.macro kvm_lvl_handler intno scratch srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + mfspr r10, SPRN_SPRG_THREAD + GET_VCPU(r11, r10) + PPC_STL r3, VCPU_GPR(r3)(r11) + mfspr r3, \scratch + PPC_STL r4, VCPU_GPR(r4)(r11) + PPC_LL r4, GPR9(r8) + PPC_STL r5, VCPU_GPR(r5)(r11) + stw r9, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(r8)(r11) + PPC_LL r3, GPR10(r8) + PPC_STL r6, VCPU_GPR(r6)(r11) + PPC_STL r4, VCPU_GPR(r9)(r11) + mfspr r6, \srr1 + PPC_LL r4, GPR11(r8) + PPC_STL r7, VCPU_GPR(r7)(r11) + PPC_STL r3, VCPU_GPR(r10)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(r12)(r11) + PPC_STL r13, VCPU_GPR(r13)(r11) + PPC_STL r4, VCPU_GPR(r11)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \ + SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \ + SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ + SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 + + +/* Registers: + * SPRG_SCRATCH0: guest r10 + * r4: vcpu pointer + * r11: vcpu->arch.shared + * r14: KVM exit number + */ +_GLOBAL(kvmppc_resume_host) + /* Save remaining volatile guest register state to vcpu. */ + mfspr r3, SPRN_VRSAVE + PPC_STL r0, VCPU_GPR(r0)(r4) + mflr r5 + mfspr r6, SPRN_SPRG4 + PPC_STL r5, VCPU_LR(r4) + mfspr r7, SPRN_SPRG5 + stw r3, VCPU_VRSAVE(r4) + PPC_STD(r6, VCPU_SHARED_SPRG4, r11) + mfspr r8, SPRN_SPRG6 + PPC_STD(r7, VCPU_SHARED_SPRG5, r11) + mfspr r9, SPRN_SPRG7 + PPC_STD(r8, VCPU_SHARED_SPRG6, r11) + mfxer r3 + PPC_STD(r9, VCPU_SHARED_SPRG7, r11) + + /* save guest MAS registers and restore host mas4 & mas6 */ + mfspr r5, SPRN_MAS0 + PPC_STL r3, VCPU_XER(r4) + mfspr r6, SPRN_MAS1 + stw r5, VCPU_SHARED_MAS0(r11) + mfspr r7, SPRN_MAS2 + stw r6, VCPU_SHARED_MAS1(r11) + PPC_STD(r7, VCPU_SHARED_MAS2, r11) + mfspr r5, SPRN_MAS3 + mfspr r6, SPRN_MAS4 + stw r5, VCPU_SHARED_MAS7_3+4(r11) + mfspr r7, SPRN_MAS6 + stw r6, VCPU_SHARED_MAS4(r11) + mfspr r5, SPRN_MAS7 + lwz r6, VCPU_HOST_MAS4(r4) + stw r7, VCPU_SHARED_MAS6(r11) + lwz r8, VCPU_HOST_MAS6(r4) + mtspr SPRN_MAS4, r6 + stw r5, VCPU_SHARED_MAS7_3+0(r11) + mtspr SPRN_MAS6, r8 + /* Enable MAS register updates via exception */ + mfspr r3, SPRN_EPCR + rlwinm r3, r3, 0, ~SPRN_EPCR_DMIUH + mtspr SPRN_EPCR, r3 + isync + + /* Switch to kernel stack and jump to handler. */ + PPC_LL r3, HOST_RUN(r1) + mr r5, r14 /* intno */ + mr r14, r4 /* Save vcpu pointer. */ + bl kvmppc_handle_exit + + /* Restore vcpu pointer and the nonvolatiles we used. */ + mr r4, r14 + PPC_LL r14, VCPU_GPR(r14)(r4) + + andi. r5, r3, RESUME_FLAG_NV + beq skip_nv_load + PPC_LL r15, VCPU_GPR(r15)(r4) + PPC_LL r16, VCPU_GPR(r16)(r4) + PPC_LL r17, VCPU_GPR(r17)(r4) + PPC_LL r18, VCPU_GPR(r18)(r4) + PPC_LL r19, VCPU_GPR(r19)(r4) + PPC_LL r20, VCPU_GPR(r20)(r4) + PPC_LL r21, VCPU_GPR(r21)(r4) + PPC_LL r22, VCPU_GPR(r22)(r4) + PPC_LL r23, VCPU_GPR(r23)(r4) + PPC_LL r24, VCPU_GPR(r24)(r4) + PPC_LL r25, VCPU_GPR(r25)(r4) + PPC_LL r26, VCPU_GPR(r26)(r4) + PPC_LL r27, VCPU_GPR(r27)(r4) + PPC_LL r28, VCPU_GPR(r28)(r4) + PPC_LL r29, VCPU_GPR(r29)(r4) + PPC_LL r30, VCPU_GPR(r30)(r4) + PPC_LL r31, VCPU_GPR(r31)(r4) +skip_nv_load: + /* Should we return to the guest? */ + andi. r5, r3, RESUME_FLAG_HOST + beq lightweight_exit + + srawi r3, r3, 2 /* Shift -ERR back down. */ + +heavyweight_exit: + /* Not returning to guest. */ + PPC_LL r5, HOST_STACK_LR(r1) + lwz r6, HOST_CR(r1) + + /* + * We already saved guest volatile register state; now save the + * non-volatiles. + */ + + PPC_STL r15, VCPU_GPR(r15)(r4) + PPC_STL r16, VCPU_GPR(r16)(r4) + PPC_STL r17, VCPU_GPR(r17)(r4) + PPC_STL r18, VCPU_GPR(r18)(r4) + PPC_STL r19, VCPU_GPR(r19)(r4) + PPC_STL r20, VCPU_GPR(r20)(r4) + PPC_STL r21, VCPU_GPR(r21)(r4) + PPC_STL r22, VCPU_GPR(r22)(r4) + PPC_STL r23, VCPU_GPR(r23)(r4) + PPC_STL r24, VCPU_GPR(r24)(r4) + PPC_STL r25, VCPU_GPR(r25)(r4) + PPC_STL r26, VCPU_GPR(r26)(r4) + PPC_STL r27, VCPU_GPR(r27)(r4) + PPC_STL r28, VCPU_GPR(r28)(r4) + PPC_STL r29, VCPU_GPR(r29)(r4) + PPC_STL r30, VCPU_GPR(r30)(r4) + PPC_STL r31, VCPU_GPR(r31)(r4) + + /* Load host non-volatile register state from host stack. */ + PPC_LL r14, HOST_NV_GPR(r14)(r1) + PPC_LL r15, HOST_NV_GPR(r15)(r1) + PPC_LL r16, HOST_NV_GPR(r16)(r1) + PPC_LL r17, HOST_NV_GPR(r17)(r1) + PPC_LL r18, HOST_NV_GPR(r18)(r1) + PPC_LL r19, HOST_NV_GPR(r19)(r1) + PPC_LL r20, HOST_NV_GPR(r20)(r1) + PPC_LL r21, HOST_NV_GPR(r21)(r1) + PPC_LL r22, HOST_NV_GPR(r22)(r1) + PPC_LL r23, HOST_NV_GPR(r23)(r1) + PPC_LL r24, HOST_NV_GPR(r24)(r1) + PPC_LL r25, HOST_NV_GPR(r25)(r1) + PPC_LL r26, HOST_NV_GPR(r26)(r1) + PPC_LL r27, HOST_NV_GPR(r27)(r1) + PPC_LL r28, HOST_NV_GPR(r28)(r1) + PPC_LL r29, HOST_NV_GPR(r29)(r1) + PPC_LL r30, HOST_NV_GPR(r30)(r1) + PPC_LL r31, HOST_NV_GPR(r31)(r1) + + /* Return to kvm_vcpu_run(). */ + mtlr r5 + mtcr r6 + addi r1, r1, HOST_STACK_SIZE + /* r3 still contains the return code from kvmppc_handle_exit(). */ + blr + +/* Registers: + * r3: kvm_run pointer + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_run) + stwu r1, -HOST_STACK_SIZE(r1) + PPC_STL r1, VCPU_HOST_STACK(r4) /* Save stack pointer to vcpu. */ + + /* Save host state to stack. */ + PPC_STL r3, HOST_RUN(r1) + mflr r3 + mfcr r5 + PPC_STL r3, HOST_STACK_LR(r1) + + stw r5, HOST_CR(r1) + + /* Save host non-volatile register state to stack. */ + PPC_STL r14, HOST_NV_GPR(r14)(r1) + PPC_STL r15, HOST_NV_GPR(r15)(r1) + PPC_STL r16, HOST_NV_GPR(r16)(r1) + PPC_STL r17, HOST_NV_GPR(r17)(r1) + PPC_STL r18, HOST_NV_GPR(r18)(r1) + PPC_STL r19, HOST_NV_GPR(r19)(r1) + PPC_STL r20, HOST_NV_GPR(r20)(r1) + PPC_STL r21, HOST_NV_GPR(r21)(r1) + PPC_STL r22, HOST_NV_GPR(r22)(r1) + PPC_STL r23, HOST_NV_GPR(r23)(r1) + PPC_STL r24, HOST_NV_GPR(r24)(r1) + PPC_STL r25, HOST_NV_GPR(r25)(r1) + PPC_STL r26, HOST_NV_GPR(r26)(r1) + PPC_STL r27, HOST_NV_GPR(r27)(r1) + PPC_STL r28, HOST_NV_GPR(r28)(r1) + PPC_STL r29, HOST_NV_GPR(r29)(r1) + PPC_STL r30, HOST_NV_GPR(r30)(r1) + PPC_STL r31, HOST_NV_GPR(r31)(r1) + + /* Load guest non-volatiles. */ + PPC_LL r14, VCPU_GPR(r14)(r4) + PPC_LL r15, VCPU_GPR(r15)(r4) + PPC_LL r16, VCPU_GPR(r16)(r4) + PPC_LL r17, VCPU_GPR(r17)(r4) + PPC_LL r18, VCPU_GPR(r18)(r4) + PPC_LL r19, VCPU_GPR(r19)(r4) + PPC_LL r20, VCPU_GPR(r20)(r4) + PPC_LL r21, VCPU_GPR(r21)(r4) + PPC_LL r22, VCPU_GPR(r22)(r4) + PPC_LL r23, VCPU_GPR(r23)(r4) + PPC_LL r24, VCPU_GPR(r24)(r4) + PPC_LL r25, VCPU_GPR(r25)(r4) + PPC_LL r26, VCPU_GPR(r26)(r4) + PPC_LL r27, VCPU_GPR(r27)(r4) + PPC_LL r28, VCPU_GPR(r28)(r4) + PPC_LL r29, VCPU_GPR(r29)(r4) + PPC_LL r30, VCPU_GPR(r30)(r4) + PPC_LL r31, VCPU_GPR(r31)(r4) + + +lightweight_exit: + PPC_STL r2, HOST_R2(r1) + + mfspr r3, SPRN_PID + stw r3, VCPU_HOST_PID(r4) + lwz r3, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r3 + + PPC_LL r11, VCPU_SHARED(r4) + /* Disable MAS register updates via exception */ + mfspr r3, SPRN_EPCR + oris r3, r3, SPRN_EPCR_DMIUH@h + mtspr SPRN_EPCR, r3 + isync + /* Save host mas4 and mas6 and load guest MAS registers */ + mfspr r3, SPRN_MAS4 + stw r3, VCPU_HOST_MAS4(r4) + mfspr r3, SPRN_MAS6 + stw r3, VCPU_HOST_MAS6(r4) + lwz r3, VCPU_SHARED_MAS0(r11) + lwz r5, VCPU_SHARED_MAS1(r11) + PPC_LD(r6, VCPU_SHARED_MAS2, r11) + lwz r7, VCPU_SHARED_MAS7_3+4(r11) + lwz r8, VCPU_SHARED_MAS4(r11) + mtspr SPRN_MAS0, r3 + mtspr SPRN_MAS1, r5 + mtspr SPRN_MAS2, r6 + mtspr SPRN_MAS3, r7 + mtspr SPRN_MAS4, r8 + lwz r3, VCPU_SHARED_MAS6(r11) + lwz r5, VCPU_SHARED_MAS7_3+0(r11) + mtspr SPRN_MAS6, r3 + mtspr SPRN_MAS7, r5 + + /* + * Host interrupt handlers may have clobbered these guest-readable + * SPRGs, so we need to reload them here with the guest's values. + */ + lwz r3, VCPU_VRSAVE(r4) + PPC_LD(r5, VCPU_SHARED_SPRG4, r11) + mtspr SPRN_VRSAVE, r3 + PPC_LD(r6, VCPU_SHARED_SPRG5, r11) + mtspr SPRN_SPRG4W, r5 + PPC_LD(r7, VCPU_SHARED_SPRG6, r11) + mtspr SPRN_SPRG5W, r6 + PPC_LD(r8, VCPU_SHARED_SPRG7, r11) + mtspr SPRN_SPRG6W, r7 + mtspr SPRN_SPRG7W, r8 + + /* Load some guest volatiles. */ + PPC_LL r3, VCPU_LR(r4) + PPC_LL r5, VCPU_XER(r4) + PPC_LL r6, VCPU_CTR(r4) + lwz r7, VCPU_CR(r4) + PPC_LL r8, VCPU_PC(r4) + PPC_LD(r9, VCPU_SHARED_MSR, r11) + PPC_LL r0, VCPU_GPR(r0)(r4) + PPC_LL r1, VCPU_GPR(r1)(r4) + PPC_LL r2, VCPU_GPR(r2)(r4) + PPC_LL r10, VCPU_GPR(r10)(r4) + PPC_LL r11, VCPU_GPR(r11)(r4) + PPC_LL r12, VCPU_GPR(r12)(r4) + PPC_LL r13, VCPU_GPR(r13)(r4) + mtlr r3 + mtxer r5 + mtctr r6 + mtsrr0 r8 + mtsrr1 r9 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save enter time */ +1: + mfspr r6, SPRN_TBRU + mfspr r9, SPRN_TBRL + mfspr r8, SPRN_TBRU + cmpw r8, r6 + stw r9, VCPU_TIMING_LAST_ENTER_TBL(r4) + bne 1b + stw r8, VCPU_TIMING_LAST_ENTER_TBU(r4) +#endif + + /* + * Don't execute any instruction which can change CR after + * below instruction. + */ + mtcr r7 + + /* Finish loading guest volatiles and jump to guest. */ + PPC_LL r5, VCPU_GPR(r5)(r4) + PPC_LL r6, VCPU_GPR(r6)(r4) + PPC_LL r7, VCPU_GPR(r7)(r4) + PPC_LL r8, VCPU_GPR(r8)(r4) + PPC_LL r9, VCPU_GPR(r9)(r4) + + PPC_LL r3, VCPU_GPR(r3)(r4) + PPC_LL r4, VCPU_GPR(r4)(r4) + rfi diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index ddcd896..b479ed7 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -20,11 +20,282 @@ #include <asm/reg.h> #include <asm/cputable.h> #include <asm/tlbflush.h> -#include <asm/kvm_e500.h> #include <asm/kvm_ppc.h> +#include "../mm/mmu_decl.h" #include "booke.h" -#include "e500_tlb.h" +#include "e500.h" + +struct id { + unsigned long val; + struct id **pentry; +}; + +#define NUM_TIDS 256 + +/* + * This table provide mappings from: + * (guestAS,guestTID,guestPR) --> ID of physical cpu + * guestAS [0..1] + * guestTID [0..255] + * guestPR [0..1] + * ID [1..255] + * Each vcpu keeps one vcpu_id_table. + */ +struct vcpu_id_table { + struct id id[2][NUM_TIDS][2]; +}; + +/* + * This table provide reversed mappings of vcpu_id_table: + * ID --> address of vcpu_id_table item. + * Each physical core has one pcpu_id_table. + */ +struct pcpu_id_table { + struct id *entry[NUM_TIDS]; +}; + +static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); + +/* This variable keeps last used shadow ID on local core. + * The valid range of shadow ID is [1..255] */ +static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); + +/* + * Allocate a free shadow id and setup a valid sid mapping in given entry. + * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_setup_one(struct id *entry) +{ + unsigned long sid; + int ret = -1; + + sid = ++(__get_cpu_var(pcpu_last_used_sid)); + if (sid < NUM_TIDS) { + __get_cpu_var(pcpu_sids).entry[sid] = entry; + entry->val = sid; + entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; + ret = sid; + } + + /* + * If sid == NUM_TIDS, we've run out of sids. We return -1, and + * the caller will invalidate everything and start over. + * + * sid > NUM_TIDS indicates a race, which we disable preemption to + * avoid. + */ + WARN_ON(sid > NUM_TIDS); + + return ret; +} + +/* + * Check if given entry contain a valid shadow id mapping. + * An ID mapping is considered valid only if + * both vcpu and pcpu know this mapping. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_lookup(struct id *entry) +{ + if (entry && entry->val != 0 && + __get_cpu_var(pcpu_sids).entry[entry->val] == entry && + entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) + return entry->val; + return -1; +} + +/* Invalidate all id mappings on local core -- call with preempt disabled */ +static inline void local_sid_destroy_all(void) +{ + __get_cpu_var(pcpu_last_used_sid) = 0; + memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); +} + +static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); + return vcpu_e500->idt; +} + +static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->idt); + vcpu_e500->idt = NULL; +} + +/* Map guest pid to shadow. + * We use PID to keep shadow of current guest non-zero PID, + * and use PID1 to keep shadow of guest zero PID. + * So that guest tlbe with TID=0 can be accessed at any time */ +static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + preempt_disable(); + vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), + get_cur_pid(&vcpu_e500->vcpu), + get_cur_pr(&vcpu_e500->vcpu), 1); + vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), 0, + get_cur_pr(&vcpu_e500->vcpu), 1); + preempt_enable(); +} + +/* Invalidate all mappings on vcpu */ +static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* Invalidate one ID mapping on vcpu */ +static inline void kvmppc_e500_id_table_reset_one( + struct kvmppc_vcpu_e500 *vcpu_e500, + int as, int pid, int pr) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + + BUG_ON(as >= 2); + BUG_ON(pid >= NUM_TIDS); + BUG_ON(pr >= 2); + + idt->id[as][pid][pr].val = 0; + idt->id[as][pid][pr].pentry = NULL; + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* + * Map guest (vcpu,AS,ID,PR) to physical core shadow id. + * This function first lookup if a valid mapping exists, + * if not, then creates a new one. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + int sid; + + BUG_ON(as >= 2); + BUG_ON(gid >= NUM_TIDS); + BUG_ON(pr >= 2); + + sid = local_sid_lookup(&idt->id[as][gid][pr]); + + while (sid <= 0) { + /* No mapping yet */ + sid = local_sid_setup_one(&idt->id[as][gid][pr]); + if (sid <= 0) { + _tlbil_all(); + local_sid_destroy_all(); + } + + /* Update shadow pid when mappings are changed */ + if (!avoid_recursion) + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } + + return sid; +} + +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (vcpu->arch.pid != pid) { + vcpu_e500->pid[0] = vcpu->arch.pid = pid; + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } +} + +/* gtlbe must not be mapped by more than one host tlbe */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + unsigned int pr, tid, ts, pid; + u32 val, eaddr; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + + preempt_disable(); + + /* One guest ID may be mapped to two shadow IDs */ + for (pr = 0; pr < 2; pr++) { + /* + * The shadow PID can have a valid mapping on at most one + * host CPU. In the common case, it will be valid on this + * CPU, in which case we do a local invalidation of the + * specific address. + * + * If the shadow PID is not valid on the current host CPU, + * we invalidate the entire shadow PID. + */ + pid = local_sid_lookup(&idt->id[ts][tid][pr]); + if (pid <= 0) { + kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); + continue; + } + + /* + * The guest is invalidating a 4K entry which is in a PID + * that has a valid shadow mapping on this host CPU. We + * search host TLB to invalidate it's shadow TLB entry, + * similar to __tlbil_va except that we need to look in AS1. + */ + val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + + local_irq_restore(flags); + } + + preempt_enable(); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kvmppc_e500_id_table_reset_all(vcpu_e500); +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ + /* Recalc shadow pid since MSR changes */ + kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); +} void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu) { @@ -36,17 +307,20 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - kvmppc_e500_tlb_load(vcpu, cpu); + kvmppc_booke_vcpu_load(vcpu, cpu); + + /* Shadow PID may be expired on local core */ + kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { - kvmppc_e500_tlb_put(vcpu); - #ifdef CONFIG_SPE if (vcpu->arch.shadow_msr & MSR_SPE) kvmppc_vcpu_disable_spe(vcpu); #endif + + kvmppc_booke_vcpu_put(vcpu); } int kvmppc_core_check_processor_compat(void) @@ -61,6 +335,23 @@ int kvmppc_core_check_processor_compat(void) return r; } +static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + struct kvm_book3e_206_tlb_entry *tlbe; + + /* Insert large initial mapping for guest. */ + tlbe = get_entry(vcpu_e500, 1, 0); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); + tlbe->mas2 = 0; + tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; + + /* 4K map for serial output. Used by kernel wrapper. */ + tlbe = get_entry(vcpu_e500, 1, 1); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); + tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; + tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; +} + int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -76,32 +367,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } -/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ -int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) -{ - int index; - gva_t eaddr; - u8 pid; - u8 as; - - eaddr = tr->linear_address; - pid = (tr->linear_address >> 32) & 0xff; - as = (tr->linear_address >> 40) & 0x1; - - index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); - if (index < 0) { - tr->valid = 0; - return 0; - } - - tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); - /* XXX what does "writeable" and "usermode" even mean? */ - tr->valid = 1; - - return 0; -} - void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -115,19 +380,6 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; - sregs->u.e.mas0 = vcpu->arch.shared->mas0; - sregs->u.e.mas1 = vcpu->arch.shared->mas1; - sregs->u.e.mas2 = vcpu->arch.shared->mas2; - sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; - sregs->u.e.mas4 = vcpu->arch.shared->mas4; - sregs->u.e.mas6 = vcpu->arch.shared->mas6; - - sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); - sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; - sregs->u.e.tlbcfg[1] = vcpu_e500->tlb1cfg; - sregs->u.e.tlbcfg[2] = 0; - sregs->u.e.tlbcfg[3] = 0; - sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; @@ -135,11 +387,13 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; kvmppc_get_sregs_ivor(vcpu, sregs); + kvmppc_get_sregs_e500_tlb(vcpu, sregs); } int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int ret; if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { vcpu_e500->svr = sregs->u.e.impl.fsl.svr; @@ -147,14 +401,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; } - if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { - vcpu->arch.shared->mas0 = sregs->u.e.mas0; - vcpu->arch.shared->mas1 = sregs->u.e.mas1; - vcpu->arch.shared->mas2 = sregs->u.e.mas2; - vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; - vcpu->arch.shared->mas4 = sregs->u.e.mas4; - vcpu->arch.shared->mas6 = sregs->u.e.mas6; - } + ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); + if (ret < 0) + return ret; if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) return 0; @@ -193,9 +442,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + goto uninit_vcpu; + err = kvmppc_e500_tlb_init(vcpu_e500); if (err) - goto uninit_vcpu; + goto uninit_id; vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO); if (!vcpu->arch.shared) @@ -205,6 +457,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_id: + kvmppc_e500_id_table_free(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); free_vcpu: @@ -218,11 +472,21 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); free_page((unsigned long)vcpu->arch.shared); - kvm_vcpu_uninit(vcpu); kvmppc_e500_tlb_uninit(vcpu_e500); + kvmppc_e500_id_table_free(vcpu_e500); + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return 0; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ +} + static int __init kvmppc_e500_init(void) { int r, i; diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h new file mode 100644 index 0000000..aa8b814 --- /dev/null +++ b/arch/powerpc/kvm/e500.h @@ -0,0 +1,306 @@ +/* + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu <yu.liu@freescale.com> + * Scott Wood <scottwood@freescale.com> + * Ashish Kalra <ashish.kalra@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.h and + * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>, + * Copyright IBM Corp. 2007-2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_H +#define KVM_E500_H + +#include <linux/kvm_host.h> +#include <asm/mmu-book3e.h> +#include <asm/tlb.h> + +#define E500_PID_NUM 3 +#define E500_TLB_NUM 2 + +#define E500_TLB_VALID 1 +#define E500_TLB_DIRTY 2 +#define E500_TLB_BITMAP 4 + +struct tlbe_ref { + pfn_t pfn; + unsigned int flags; /* E500_TLB_* */ +}; + +struct tlbe_priv { + struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */ +}; + +#ifdef CONFIG_KVM_E500V2 +struct vcpu_id_table; +#endif + +struct kvmppc_e500_tlb_params { + int entries, ways, sets; +}; + +struct kvmppc_vcpu_e500 { + struct kvm_vcpu vcpu; + + /* Unmodified copy of the guest's TLB -- shared with host userspace. */ + struct kvm_book3e_206_tlb_entry *gtlb_arch; + + /* Starting entry number in gtlb_arch[] */ + int gtlb_offset[E500_TLB_NUM]; + + /* KVM internal information associated with each guest TLB entry */ + struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; + + struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; + + unsigned int gtlb_nv[E500_TLB_NUM]; + + /* + * information associated with each host TLB entry -- + * TLB1 only for now. If/when guest TLB1 entries can be + * mapped with host TLB0, this will be used for that too. + * + * We don't want to use this for guest TLB0 because then we'd + * have the overhead of doing the translation again even if + * the entry is still in the guest TLB (e.g. we swapped out + * and back, and our host TLB entries got evicted). + */ + struct tlbe_ref *tlb_refs[E500_TLB_NUM]; + unsigned int host_tlb1_nv; + + u32 svr; + u32 l1csr0; + u32 l1csr1; + u32 hid0; + u32 hid1; + u64 mcar; + + struct page **shared_tlb_pages; + int num_shared_tlb_pages; + + u64 *g2h_tlb1_map; + unsigned int *h2g_tlb1_rmap; + + /* Minimum and maximum address mapped my TLB1 */ + unsigned long tlb1_min_eaddr; + unsigned long tlb1_max_eaddr; + +#ifdef CONFIG_KVM_E500V2 + u32 pid[E500_PID_NUM]; + + /* vcpu id table */ + struct vcpu_id_table *idt; +#endif +}; + +static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu); +} + + +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE 128 +#define KVM_E500_TLB0_WAY_NUM 2 + +#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) +#define KVM_E500_TLB1_SIZE 16 + +#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF)) +#define tlbsel_of(index) ((index) >> 16) +#define esel_of(index) ((index) & 0xFFFF) + +#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) +#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) +#define MAS2_ATTRIB_MASK \ + (MAS2_X0 | MAS2_X1) +#define MAS3_ATTRIB_MASK \ + (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ + | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) + +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, + ulong value); +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb); +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb); +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb); +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + + +#ifdef CONFIG_KVM_E500V2 +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion); +#endif + +/* TLB helper functions */ +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 7) & 0x1f; +} + +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas2 & 0xfffff000; +} + +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + unsigned int pgsize = get_tlb_size(tlbe); + return 1ULL << 10 << pgsize; +} + +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + u64 bytes = get_tlb_bytes(tlbe); + return get_tlb_eaddr(tlbe) + bytes - 1; +} + +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas7_3 & ~0xfffULL; +} + +static inline unsigned int +get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 16) & 0xff; +} + +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 12) & 0x1; +} + +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 31) & 0x1; +} + +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 30) & 0x1; +} + +static inline unsigned int +get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; +} + +static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pid & 0xff; +} + +static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); +} + +static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & MSR_PR); +} + +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.shared->mas6 >> 16) & 0xff; +} + +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shared->mas6 & 0x1; +} + +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) +{ + /* + * Manual says that tlbsel has 2 bits wide. + * Since we only have two TLBs, only lower bit is used. + */ + return (vcpu->arch.shared->mas0 >> 28) & 0x1; +} + +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shared->mas0 & 0xfff; +} + +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.shared->mas0 >> 16) & 0xfff; +} + +static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, + const struct kvm_book3e_206_tlb_entry *tlbe) +{ + gpa_t gpa; + + if (!get_tlb_v(tlbe)) + return 0; + +#ifndef CONFIG_KVM_BOOKE_HV + /* Does it match current guest AS? */ + /* XXX what about IS != DS? */ + if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) + return 0; +#endif + + gpa = get_tlb_raddr(tlbe); + if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) + /* Mapping is not for RAM. */ + return 0; + + return 1; +} + +static inline struct kvm_book3e_206_tlb_entry *get_entry( + struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ + int offset = vcpu_e500->gtlb_offset[tlbsel]; + return &vcpu_e500->gtlb_arch[offset + entry]; +} + +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe); +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500); + +#ifdef CONFIG_KVM_BOOKE_HV +#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe) get_tlb_tid(gtlbe) +#define get_tlbmiss_tid(vcpu) get_cur_pid(vcpu) +#define get_tlb_sts(gtlbe) (gtlbe->mas1 & MAS1_TS) +#else +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe); + +static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf; + + return vcpu_e500->pid[tidseld]; +} + +/* Force TS=1 for all guest mappings. */ +#define get_tlb_sts(gtlbe) (MAS1_TS) +#endif /* !BOOKE_HV */ + +#endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 6d0b2bd..8b99e07 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -14,27 +14,96 @@ #include <asm/kvm_ppc.h> #include <asm/disassemble.h> -#include <asm/kvm_e500.h> +#include <asm/dbell.h> #include "booke.h" -#include "e500_tlb.h" +#include "e500.h" +#define XOP_MSGSND 206 +#define XOP_MSGCLR 238 #define XOP_TLBIVAX 786 #define XOP_TLBSX 914 #define XOP_TLBRE 946 #define XOP_TLBWE 978 +#define XOP_TLBILX 18 + +#ifdef CONFIG_KVM_E500MC +static int dbell2prio(ulong param) +{ + int msg = param & PPC_DBELL_TYPE_MASK; + int prio = -1; + + switch (msg) { + case PPC_DBELL_TYPE(PPC_DBELL): + prio = BOOKE_IRQPRIO_DBELL; + break; + case PPC_DBELL_TYPE(PPC_DBELL_CRIT): + prio = BOOKE_IRQPRIO_DBELL_CRIT; + break; + default: + break; + } + + return prio; +} + +static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb) +{ + ulong param = vcpu->arch.gpr[rb]; + int prio = dbell2prio(param); + + if (prio < 0) + return EMULATE_FAIL; + + clear_bit(prio, &vcpu->arch.pending_exceptions); + return EMULATE_DONE; +} + +static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) +{ + ulong param = vcpu->arch.gpr[rb]; + int prio = dbell2prio(rb); + int pir = param & PPC_DBELL_PIR_MASK; + int i; + struct kvm_vcpu *cvcpu; + + if (prio < 0) + return EMULATE_FAIL; + + kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) { + int cpir = cvcpu->arch.shared->pir; + if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) { + set_bit(prio, &cvcpu->arch.pending_exceptions); + kvm_vcpu_kick(cvcpu); + } + } + + return EMULATE_DONE; +} +#endif int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { int emulated = EMULATE_DONE; - int ra; - int rb; + int ra = get_ra(inst); + int rb = get_rb(inst); + int rt = get_rt(inst); switch (get_op(inst)) { case 31: switch (get_xop(inst)) { +#ifdef CONFIG_KVM_E500MC + case XOP_MSGSND: + emulated = kvmppc_e500_emul_msgsnd(vcpu, rb); + break; + + case XOP_MSGCLR: + emulated = kvmppc_e500_emul_msgclr(vcpu, rb); + break; +#endif + case XOP_TLBRE: emulated = kvmppc_e500_emul_tlbre(vcpu); break; @@ -44,13 +113,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_TLBSX: - rb = get_rb(inst); emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); break; + case XOP_TLBILX: + emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb); + break; + case XOP_TLBIVAX: - ra = get_ra(inst); - rb = get_rb(inst); emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); break; @@ -70,52 +140,63 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, return emulated; } -int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; - ulong spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV case SPRN_PID: kvmppc_set_pid(vcpu, spr_val); break; case SPRN_PID1: if (spr_val != 0) return EMULATE_FAIL; - vcpu_e500->pid[1] = spr_val; break; + vcpu_e500->pid[1] = spr_val; + break; case SPRN_PID2: if (spr_val != 0) return EMULATE_FAIL; - vcpu_e500->pid[2] = spr_val; break; + vcpu_e500->pid[2] = spr_val; + break; case SPRN_MAS0: - vcpu->arch.shared->mas0 = spr_val; break; + vcpu->arch.shared->mas0 = spr_val; + break; case SPRN_MAS1: - vcpu->arch.shared->mas1 = spr_val; break; + vcpu->arch.shared->mas1 = spr_val; + break; case SPRN_MAS2: - vcpu->arch.shared->mas2 = spr_val; break; + vcpu->arch.shared->mas2 = spr_val; + break; case SPRN_MAS3: vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; vcpu->arch.shared->mas7_3 |= spr_val; break; case SPRN_MAS4: - vcpu->arch.shared->mas4 = spr_val; break; + vcpu->arch.shared->mas4 = spr_val; + break; case SPRN_MAS6: - vcpu->arch.shared->mas6 = spr_val; break; + vcpu->arch.shared->mas6 = spr_val; + break; case SPRN_MAS7: vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; break; +#endif case SPRN_L1CSR0: vcpu_e500->l1csr0 = spr_val; vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); break; case SPRN_L1CSR1: - vcpu_e500->l1csr1 = spr_val; break; + vcpu_e500->l1csr1 = spr_val; + break; case SPRN_HID0: - vcpu_e500->hid0 = spr_val; break; + vcpu_e500->hid0 = spr_val; + break; case SPRN_HID1: - vcpu_e500->hid1 = spr_val; break; + vcpu_e500->hid1 = spr_val; + break; case SPRN_MMUCSR0: emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, @@ -135,81 +216,112 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_IVOR35: vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; break; - +#ifdef CONFIG_KVM_BOOKE_HV + case SPRN_IVOR36: + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val; + break; + case SPRN_IVOR37: + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val; + break; +#endif default: - emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs); + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val); } return emulated; } -int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; - unsigned long val; switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV case SPRN_PID: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break; + *spr_val = vcpu_e500->pid[0]; + break; case SPRN_PID1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[1]); break; + *spr_val = vcpu_e500->pid[1]; + break; case SPRN_PID2: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; + *spr_val = vcpu_e500->pid[2]; + break; case SPRN_MAS0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break; + *spr_val = vcpu->arch.shared->mas0; + break; case SPRN_MAS1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break; + *spr_val = vcpu->arch.shared->mas1; + break; case SPRN_MAS2: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break; + *spr_val = vcpu->arch.shared->mas2; + break; case SPRN_MAS3: - val = (u32)vcpu->arch.shared->mas7_3; - kvmppc_set_gpr(vcpu, rt, val); + *spr_val = (u32)vcpu->arch.shared->mas7_3; break; case SPRN_MAS4: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break; + *spr_val = vcpu->arch.shared->mas4; + break; case SPRN_MAS6: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break; + *spr_val = vcpu->arch.shared->mas6; + break; case SPRN_MAS7: - val = vcpu->arch.shared->mas7_3 >> 32; - kvmppc_set_gpr(vcpu, rt, val); + *spr_val = vcpu->arch.shared->mas7_3 >> 32; break; +#endif case SPRN_TLB0CFG: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; + *spr_val = vcpu->arch.tlbcfg[0]; + break; case SPRN_TLB1CFG: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break; + *spr_val = vcpu->arch.tlbcfg[1]; + break; case SPRN_L1CSR0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break; + *spr_val = vcpu_e500->l1csr0; + break; case SPRN_L1CSR1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr1); break; + *spr_val = vcpu_e500->l1csr1; + break; case SPRN_HID0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid0); break; + *spr_val = vcpu_e500->hid0; + break; case SPRN_HID1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->hid1); break; + *spr_val = vcpu_e500->hid1; + break; case SPRN_SVR: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->svr); break; + *spr_val = vcpu_e500->svr; + break; case SPRN_MMUCSR0: - kvmppc_set_gpr(vcpu, rt, 0); break; + *spr_val = 0; + break; case SPRN_MMUCFG: - kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break; + *spr_val = vcpu->arch.mmucfg; + break; /* extra exceptions */ case SPRN_IVOR32: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; break; case SPRN_IVOR33: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; break; case SPRN_IVOR34: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; break; case SPRN_IVOR35: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]); + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + break; +#ifdef CONFIG_KVM_BOOKE_HV + case SPRN_IVOR36: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; + break; + case SPRN_IVOR37: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; break; +#endif default: - emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt); + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); } return emulated; diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 6e53e41..c510fc9 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -2,6 +2,9 @@ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. * * Author: Yu Liu, yu.liu@freescale.com + * Scott Wood, scottwood@freescale.com + * Ashish Kalra, ashish.kalra@freescale.com + * Varun Sethi, varun.sethi@freescale.com * * Description: * This file is based on arch/powerpc/kvm/44x_tlb.c, @@ -26,210 +29,15 @@ #include <linux/vmalloc.h> #include <linux/hugetlb.h> #include <asm/kvm_ppc.h> -#include <asm/kvm_e500.h> -#include "../mm/mmu_decl.h" -#include "e500_tlb.h" +#include "e500.h" #include "trace.h" #include "timing.h" #define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) -struct id { - unsigned long val; - struct id **pentry; -}; - -#define NUM_TIDS 256 - -/* - * This table provide mappings from: - * (guestAS,guestTID,guestPR) --> ID of physical cpu - * guestAS [0..1] - * guestTID [0..255] - * guestPR [0..1] - * ID [1..255] - * Each vcpu keeps one vcpu_id_table. - */ -struct vcpu_id_table { - struct id id[2][NUM_TIDS][2]; -}; - -/* - * This table provide reversed mappings of vcpu_id_table: - * ID --> address of vcpu_id_table item. - * Each physical core has one pcpu_id_table. - */ -struct pcpu_id_table { - struct id *entry[NUM_TIDS]; -}; - -static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); - -/* This variable keeps last used shadow ID on local core. - * The valid range of shadow ID is [1..255] */ -static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); - static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; -static struct kvm_book3e_206_tlb_entry *get_entry( - struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) -{ - int offset = vcpu_e500->gtlb_offset[tlbsel]; - return &vcpu_e500->gtlb_arch[offset + entry]; -} - -/* - * Allocate a free shadow id and setup a valid sid mapping in given entry. - * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. - * - * The caller must have preemption disabled, and keep it that way until - * it has finished with the returned shadow id (either written into the - * TLB or arch.shadow_pid, or discarded). - */ -static inline int local_sid_setup_one(struct id *entry) -{ - unsigned long sid; - int ret = -1; - - sid = ++(__get_cpu_var(pcpu_last_used_sid)); - if (sid < NUM_TIDS) { - __get_cpu_var(pcpu_sids).entry[sid] = entry; - entry->val = sid; - entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid]; - ret = sid; - } - - /* - * If sid == NUM_TIDS, we've run out of sids. We return -1, and - * the caller will invalidate everything and start over. - * - * sid > NUM_TIDS indicates a race, which we disable preemption to - * avoid. - */ - WARN_ON(sid > NUM_TIDS); - - return ret; -} - -/* - * Check if given entry contain a valid shadow id mapping. - * An ID mapping is considered valid only if - * both vcpu and pcpu know this mapping. - * - * The caller must have preemption disabled, and keep it that way until - * it has finished with the returned shadow id (either written into the - * TLB or arch.shadow_pid, or discarded). - */ -static inline int local_sid_lookup(struct id *entry) -{ - if (entry && entry->val != 0 && - __get_cpu_var(pcpu_sids).entry[entry->val] == entry && - entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val]) - return entry->val; - return -1; -} - -/* Invalidate all id mappings on local core -- call with preempt disabled */ -static inline void local_sid_destroy_all(void) -{ - __get_cpu_var(pcpu_last_used_sid) = 0; - memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); -} - -static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); - return vcpu_e500->idt; -} - -static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - kfree(vcpu_e500->idt); -} - -/* Invalidate all mappings on vcpu */ -static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); - - /* Update shadow pid when mappings are changed */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -/* Invalidate one ID mapping on vcpu */ -static inline void kvmppc_e500_id_table_reset_one( - struct kvmppc_vcpu_e500 *vcpu_e500, - int as, int pid, int pr) -{ - struct vcpu_id_table *idt = vcpu_e500->idt; - - BUG_ON(as >= 2); - BUG_ON(pid >= NUM_TIDS); - BUG_ON(pr >= 2); - - idt->id[as][pid][pr].val = 0; - idt->id[as][pid][pr].pentry = NULL; - - /* Update shadow pid when mappings are changed */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -/* - * Map guest (vcpu,AS,ID,PR) to physical core shadow id. - * This function first lookup if a valid mapping exists, - * if not, then creates a new one. - * - * The caller must have preemption disabled, and keep it that way until - * it has finished with the returned shadow id (either written into the - * TLB or arch.shadow_pid, or discarded). - */ -static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, - unsigned int as, unsigned int gid, - unsigned int pr, int avoid_recursion) -{ - struct vcpu_id_table *idt = vcpu_e500->idt; - int sid; - - BUG_ON(as >= 2); - BUG_ON(gid >= NUM_TIDS); - BUG_ON(pr >= 2); - - sid = local_sid_lookup(&idt->id[as][gid][pr]); - - while (sid <= 0) { - /* No mapping yet */ - sid = local_sid_setup_one(&idt->id[as][gid][pr]); - if (sid <= 0) { - _tlbil_all(); - local_sid_destroy_all(); - } - - /* Update shadow pid when mappings are changed */ - if (!avoid_recursion) - kvmppc_e500_recalc_shadow_pid(vcpu_e500); - } - - return sid; -} - -/* Map guest pid to shadow. - * We use PID to keep shadow of current guest non-zero PID, - * and use PID1 to keep shadow of guest zero PID. - * So that guest tlbe with TID=0 can be accessed at any time */ -void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - preempt_disable(); - vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, - get_cur_as(&vcpu_e500->vcpu), - get_cur_pid(&vcpu_e500->vcpu), - get_cur_pr(&vcpu_e500->vcpu), 1); - vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, - get_cur_as(&vcpu_e500->vcpu), 0, - get_cur_pr(&vcpu_e500->vcpu), 1); - preempt_enable(); -} - static inline unsigned int gtlb0_get_next_victim( struct kvmppc_vcpu_e500 *vcpu_e500) { @@ -258,6 +66,7 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) /* Mask off reserved bits. */ mas3 &= MAS3_ATTRIB_MASK; +#ifndef CONFIG_KVM_BOOKE_HV if (!usermode) { /* Guest is in supervisor mode, * so we need to translate guest @@ -265,8 +74,9 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) mas3 &= ~E500_TLB_USER_PERM_MASK; mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; } - - return mas3 | E500_TLB_SUPER_PERM_MASK; + mas3 |= E500_TLB_SUPER_PERM_MASK; +#endif + return mas3; } static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) @@ -292,7 +102,16 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_MAS8, stlbe->mas8); +#endif asm volatile("isync; tlbwe" : : : "memory"); + +#ifdef CONFIG_KVM_BOOKE_HV + /* Must clear mas8 for other host tlbwe's */ + mtspr(SPRN_MAS8, 0); + isync(); +#endif local_irq_restore(flags); trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, @@ -337,6 +156,7 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, } } +#ifdef CONFIG_KVM_E500V2 void kvmppc_map_magic(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); @@ -361,75 +181,41 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); preempt_enable(); } - -void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - - /* Shadow PID may be expired on local core */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); -} - -void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) -{ -} +#endif static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int esel) { struct kvm_book3e_206_tlb_entry *gtlbe = get_entry(vcpu_e500, tlbsel, esel); - struct vcpu_id_table *idt = vcpu_e500->idt; - unsigned int pr, tid, ts, pid; - u32 val, eaddr; - unsigned long flags; - - ts = get_tlb_ts(gtlbe); - tid = get_tlb_tid(gtlbe); - - preempt_disable(); - - /* One guest ID may be mapped to two shadow IDs */ - for (pr = 0; pr < 2; pr++) { - /* - * The shadow PID can have a valid mapping on at most one - * host CPU. In the common case, it will be valid on this - * CPU, in which case (for TLB0) we do a local invalidation - * of the specific address. - * - * If the shadow PID is not valid on the current host CPU, or - * if we're invalidating a TLB1 entry, we invalidate the - * entire shadow PID. - */ - if (tlbsel == 1 || - (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) { - kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); - continue; - } - /* - * The guest is invalidating a TLB0 entry which is in a PID - * that has a valid shadow mapping on this host CPU. We - * search host TLB0 to invalidate it's shadow TLB entry, - * similar to __tlbil_va except that we need to look in AS1. - */ - val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; - eaddr = get_tlb_eaddr(gtlbe); + if (tlbsel == 1 && + vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) { + u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; + int hw_tlb_indx; + unsigned long flags; local_irq_save(flags); - - mtspr(SPRN_MAS6, val); - asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); - val = mfspr(SPRN_MAS1); - if (val & MAS1_VALID) { - mtspr(SPRN_MAS1, val & ~MAS1_VALID); + while (tmp) { + hw_tlb_indx = __ilog2_u64(tmp & -tmp); + mtspr(SPRN_MAS0, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(hw_tlb_indx))); + mtspr(SPRN_MAS1, 0); asm volatile("tlbwe"); + vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0; + tmp &= tmp - 1; } - + mb(); + vcpu_e500->g2h_tlb1_map[esel] = 0; + vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP; local_irq_restore(flags); + + return; } - preempt_enable(); + /* Guest tlbe is backed by at most one host tlbe per shadow pid. */ + kvmppc_e500_tlbil_one(vcpu_e500, gtlbe); } static int tlb0_set_base(gva_t addr, int sets, int ways) @@ -475,6 +261,9 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, set_base = gtlb0_set_base(vcpu_e500, eaddr); size = vcpu_e500->gtlb_params[0].ways; } else { + if (eaddr < vcpu_e500->tlb1_min_eaddr || + eaddr > vcpu_e500->tlb1_max_eaddr) + return -1; set_base = 0; } @@ -530,6 +319,16 @@ static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) } } +static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + if (vcpu_e500->g2h_tlb1_map) + memset(vcpu_e500->g2h_tlb1_map, + sizeof(u64) * vcpu_e500->gtlb_params[1].entries, 0); + if (vcpu_e500->h2g_tlb1_rmap) + memset(vcpu_e500->h2g_tlb1_rmap, + sizeof(unsigned int) * host_tlb_params[1].entries, 0); +} + static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) { int tlbsel = 0; @@ -547,7 +346,7 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) int stlbsel = 1; int i; - kvmppc_e500_id_table_reset_all(vcpu_e500); + kvmppc_e500_tlbil_all(vcpu_e500); for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { struct tlbe_ref *ref = @@ -562,19 +361,18 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, unsigned int eaddr, int as) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - unsigned int victim, pidsel, tsized; + unsigned int victim, tsized; int tlbsel; /* since we only have two TLBs, only lower bit is used. */ tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; - pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf; tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) - | MAS1_TID(vcpu_e500->pid[pidsel]) + | MAS1_TID(get_tlbmiss_tid(vcpu)) | MAS1_TSIZE(tsized); vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); @@ -586,23 +384,26 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, /* TID must be supplied by the caller */ static inline void kvmppc_e500_setup_stlbe( - struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_vcpu *vcpu, struct kvm_book3e_206_tlb_entry *gtlbe, int tsize, struct tlbe_ref *ref, u64 gvaddr, struct kvm_book3e_206_tlb_entry *stlbe) { pfn_t pfn = ref->pfn; + u32 pr = vcpu->arch.shared->msr & MSR_PR; BUG_ON(!(ref->flags & E500_TLB_VALID)); - /* Force TS=1 IPROT=0 for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID; - stlbe->mas2 = (gvaddr & MAS2_EPN) - | e500_shadow_mas2_attrib(gtlbe->mas2, - vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) - | e500_shadow_mas3_attrib(gtlbe->mas7_3, - vcpu_e500->vcpu.arch.shared->msr & MSR_PR); + /* Force IPROT=0 for all guest mappings. */ + stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; + stlbe->mas2 = (gvaddr & MAS2_EPN) | + e500_shadow_mas2_attrib(gtlbe->mas2, pr); + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | + e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); + +#ifdef CONFIG_KVM_BOOKE_HV + stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid; +#endif } static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, @@ -736,7 +537,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_e500_ref_release(ref); kvmppc_e500_ref_setup(ref, gtlbe, pfn); - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe); + kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, + ref, gvaddr, stlbe); } /* XXX only map the one-one case, for now use TLB0 */ @@ -760,7 +562,7 @@ static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, /* XXX for both one-one and one-to-many , for now use TLB1 */ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, - struct kvm_book3e_206_tlb_entry *stlbe) + struct kvm_book3e_206_tlb_entry *stlbe, int esel) { struct tlbe_ref *ref; unsigned int victim; @@ -773,15 +575,74 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, ref = &vcpu_e500->tlb_refs[1][victim]; kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref); + vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim; + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; + if (vcpu_e500->h2g_tlb1_rmap[victim]) { + unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim]; + vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim); + } + vcpu_e500->h2g_tlb1_rmap[victim] = esel; + return victim; } -void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int size = vcpu_e500->gtlb_params[1].entries; + unsigned int offset; + gva_t eaddr; + int i; + + vcpu_e500->tlb1_min_eaddr = ~0UL; + vcpu_e500->tlb1_max_eaddr = 0; + offset = vcpu_e500->gtlb_offset[1]; + + for (i = 0; i < size; i++) { + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + i]; + + if (!get_tlb_v(tlbe)) + continue; + + eaddr = get_tlb_eaddr(tlbe); + vcpu_e500->tlb1_min_eaddr = + min(vcpu_e500->tlb1_min_eaddr, eaddr); + + eaddr = get_tlb_end(tlbe); + vcpu_e500->tlb1_max_eaddr = + max(vcpu_e500->tlb1_max_eaddr, eaddr); + } +} + +static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) { + unsigned long start, end, size; + + size = get_tlb_bytes(gtlbe); + start = get_tlb_eaddr(gtlbe) & ~(size - 1); + end = start + size - 1; + + return vcpu_e500->tlb1_min_eaddr == start || + vcpu_e500->tlb1_max_eaddr == end; +} + +/* This function is supposed to be called for a adding a new valid tlb entry */ +static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned long start, end, size; struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - /* Recalc shadow pid since MSR changes */ - kvmppc_e500_recalc_shadow_pid(vcpu_e500); + if (!get_tlb_v(gtlbe)) + return; + + size = get_tlb_bytes(gtlbe); + start = get_tlb_eaddr(gtlbe) & ~(size - 1); + end = start + size - 1; + + vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start); + vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end); } static inline int kvmppc_e500_gtlbe_invalidate( @@ -794,6 +655,9 @@ static inline int kvmppc_e500_gtlbe_invalidate( if (unlikely(get_tlb_iprot(gtlbe))) return -1; + if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) + kvmppc_recalc_tlb1map_range(vcpu_e500); + gtlbe->mas1 = 0; return 0; @@ -811,7 +675,7 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); /* Invalidate all vcpu id mappings */ - kvmppc_e500_id_table_reset_all(vcpu_e500); + kvmppc_e500_tlbil_all(vcpu_e500); return EMULATE_DONE; } @@ -844,7 +708,59 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) } /* Invalidate all vcpu id mappings */ - kvmppc_e500_id_table_reset_all(vcpu_e500); + kvmppc_e500_tlbil_all(vcpu_e500); + + return EMULATE_DONE; +} + +static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int pid, int rt) +{ + struct kvm_book3e_206_tlb_entry *tlbe; + int tid, esel; + + /* invalidate all entries */ + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { + tlbe = get_entry(vcpu_e500, tlbsel, esel); + tid = get_tlb_tid(tlbe); + if (rt == 0 || tid == pid) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } + } +} + +static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, + int ra, int rb) +{ + int tlbsel, esel; + gva_t ea; + + ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb); + if (ra) + ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra); + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); + if (esel >= 0) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + break; + } + } +} + +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int pid = get_cur_spid(vcpu); + + if (rt == 0 || rt == 1) { + tlbilx_all(vcpu_e500, 0, pid, rt); + tlbilx_all(vcpu_e500, 1, pid, rt); + } else if (rt == 3) { + tlbilx_one(vcpu_e500, pid, ra, rb); + } return EMULATE_DONE; } @@ -929,9 +845,7 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, int stid; preempt_disable(); - stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), - get_tlb_tid(gtlbe), - get_cur_pr(&vcpu_e500->vcpu), 0); + stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe); stlbe->mas1 |= MAS1_TID(stid); write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); @@ -941,16 +855,21 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct kvm_book3e_206_tlb_entry *gtlbe; - int tlbsel, esel; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; + int tlbsel, esel, stlbsel, sesel; + int recal = 0; tlbsel = get_tlb_tlbsel(vcpu); esel = get_tlb_esel(vcpu, tlbsel); gtlbe = get_entry(vcpu_e500, tlbsel, esel); - if (get_tlb_v(gtlbe)) + if (get_tlb_v(gtlbe)) { inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + if ((tlbsel == 1) && + kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) + recal = 1; + } gtlbe->mas1 = vcpu->arch.shared->mas1; gtlbe->mas2 = vcpu->arch.shared->mas2; @@ -959,10 +878,20 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, gtlbe->mas2, gtlbe->mas7_3); + if (tlbsel == 1) { + /* + * If a valid tlb1 entry is overwritten then recalculate the + * min/max TLB1 map address range otherwise no need to look + * in tlb1 array. + */ + if (recal) + kvmppc_recalc_tlb1map_range(vcpu_e500); + else + kvmppc_set_tlb1map_range(vcpu, gtlbe); + } + /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { - struct kvm_book3e_206_tlb_entry stlbe; - int stlbsel, sesel; u64 eaddr; u64 raddr; @@ -989,7 +918,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) * are mapped on the fly. */ stlbsel = 1; sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, - raddr >> PAGE_SHIFT, gtlbe, &stlbe); + raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel); break; default: @@ -1003,6 +932,48 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, + gva_t eaddr, unsigned int pid, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel, tlbsel; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); + if (esel >= 0) + return index_of(tlbsel, esel); + } + + return -1; +} + +/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ +int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + int index; + gva_t eaddr; + u8 pid; + u8 as; + + eaddr = tr->linear_address; + pid = (tr->linear_address >> 32) & 0xff; + as = (tr->linear_address >> 40) & 0x1; + + index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); + if (index < 0) { + tr->valid = 0; + return 0; + } + + tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); + /* XXX what does "writeable" and "usermode" even mean? */ + tr->valid = 1; + + return 0; +} + + int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) { unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); @@ -1066,7 +1037,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, sesel = 0; /* unused */ priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, &priv->ref, eaddr, &stlbe); break; @@ -1075,7 +1046,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, stlbsel = 1; sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, - gtlbe, &stlbe); + gtlbe, &stlbe, esel); break; } @@ -1087,52 +1058,13 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } -int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, - gva_t eaddr, unsigned int pid, int as) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int esel, tlbsel; - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); - if (esel >= 0) - return index_of(tlbsel, esel); - } - - return -1; -} - -void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - - if (vcpu->arch.pid != pid) { - vcpu_e500->pid[0] = vcpu->arch.pid = pid; - kvmppc_e500_recalc_shadow_pid(vcpu_e500); - } -} - -void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) -{ - struct kvm_book3e_206_tlb_entry *tlbe; - - /* Insert large initial mapping for guest. */ - tlbe = get_entry(vcpu_e500, 1, 0); - tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); - tlbe->mas2 = 0; - tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; - - /* 4K map for serial output. Used by kernel wrapper. */ - tlbe = get_entry(vcpu_e500, 1, 1); - tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); - tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; - tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; -} - static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) { int i; + clear_tlb1_bitmap(vcpu_e500); + kfree(vcpu_e500->g2h_tlb1_map); + clear_tlb_refs(vcpu_e500); kfree(vcpu_e500->gtlb_priv[0]); kfree(vcpu_e500->gtlb_priv[1]); @@ -1155,6 +1087,36 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) vcpu_e500->gtlb_arch = NULL; } +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + sregs->u.e.mas0 = vcpu->arch.shared->mas0; + sregs->u.e.mas1 = vcpu->arch.shared->mas1; + sregs->u.e.mas2 = vcpu->arch.shared->mas2; + sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; + sregs->u.e.mas4 = vcpu->arch.shared->mas4; + sregs->u.e.mas6 = vcpu->arch.shared->mas6; + + sregs->u.e.mmucfg = vcpu->arch.mmucfg; + sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0]; + sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1]; + sregs->u.e.tlbcfg[2] = 0; + sregs->u.e.tlbcfg[3] = 0; +} + +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { + vcpu->arch.shared->mas0 = sregs->u.e.mas0; + vcpu->arch.shared->mas1 = sregs->u.e.mas1; + vcpu->arch.shared->mas2 = sregs->u.e.mas2; + vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; + vcpu->arch.shared->mas4 = sregs->u.e.mas4; + vcpu->arch.shared->mas6 = sregs->u.e.mas6; + } + + return 0; +} + int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, struct kvm_config_tlb *cfg) { @@ -1163,6 +1125,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, char *virt; struct page **pages; struct tlbe_priv *privs[2] = {}; + u64 *g2h_bitmap = NULL; size_t array_len; u32 sets; int num_pages, ret, i; @@ -1224,10 +1187,16 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, if (!privs[0] || !privs[1]) goto err_put_page; + g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], + GFP_KERNEL); + if (!g2h_bitmap) + goto err_put_page; + free_gtlb(vcpu_e500); vcpu_e500->gtlb_priv[0] = privs[0]; vcpu_e500->gtlb_priv[1] = privs[1]; + vcpu_e500->g2h_tlb1_map = g2h_bitmap; vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) (virt + (cfg->array & (PAGE_SIZE - 1))); @@ -1238,14 +1207,16 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500->gtlb_offset[0] = 0; vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; - vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; + + vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); if (params.tlb_sizes[0] <= 2048) - vcpu_e500->tlb0cfg |= params.tlb_sizes[0]; - vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0]; + vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; - vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu_e500->tlb1cfg |= params.tlb_sizes[1]; - vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1]; + vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; vcpu_e500->shared_tlb_pages = pages; vcpu_e500->num_shared_tlb_pages = num_pages; @@ -1256,6 +1227,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; vcpu_e500->gtlb_params[1].sets = 1; + kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; err_put_page: @@ -1274,13 +1246,14 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, struct kvm_dirty_tlb *dirty) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - + kvmppc_recalc_tlb1map_range(vcpu_e500); clear_tlb_refs(vcpu_e500); return 0; } int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { + struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; @@ -1357,22 +1330,32 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500->gtlb_priv[1]) goto err; - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->g2h_tlb1_map) + goto err; + + vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * + host_tlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->h2g_tlb1_rmap) goto err; /* Init TLB configuration register */ - vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & + vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries; - vcpu_e500->tlb0cfg |= + vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries; + vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT; - vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & + vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); - vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries; - vcpu_e500->tlb0cfg |= + vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries; + vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; + kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; err: @@ -1385,8 +1368,7 @@ err: void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { free_gtlb(vcpu_e500); - kvmppc_e500_id_table_free(vcpu_e500); - + kfree(vcpu_e500->h2g_tlb1_rmap); kfree(vcpu_e500->tlb_refs[0]); kfree(vcpu_e500->tlb_refs[1]); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h deleted file mode 100644 index 5c6d2d7..0000000 --- a/arch/powerpc/kvm/e500_tlb.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. - * - * Author: Yu Liu, yu.liu@freescale.com - * - * Description: - * This file is based on arch/powerpc/kvm/44x_tlb.h, - * by Hollis Blanchard <hollisb@us.ibm.com>. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - */ - -#ifndef __KVM_E500_TLB_H__ -#define __KVM_E500_TLB_H__ - -#include <linux/kvm_host.h> -#include <asm/mmu-book3e.h> -#include <asm/tlb.h> -#include <asm/kvm_e500.h> - -/* This geometry is the legacy default -- can be overridden by userspace */ -#define KVM_E500_TLB0_WAY_SIZE 128 -#define KVM_E500_TLB0_WAY_NUM 2 - -#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) -#define KVM_E500_TLB1_SIZE 16 - -#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF)) -#define tlbsel_of(index) ((index) >> 16) -#define esel_of(index) ((index) & 0xFFFF) - -#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) -#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) -#define MAS2_ATTRIB_MASK \ - (MAS2_X0 | MAS2_X1) -#define MAS3_ATTRIB_MASK \ - (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ - | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) - -extern void kvmppc_dump_tlbs(struct kvm_vcpu *); -extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong); -extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *); -extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *); -extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int); -extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int); -extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int); -extern void kvmppc_e500_tlb_put(struct kvm_vcpu *); -extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int); -extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); -extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); - -/* TLB helper functions */ -static inline unsigned int -get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return (tlbe->mas1 >> 7) & 0x1f; -} - -static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return tlbe->mas2 & 0xfffff000; -} - -static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - unsigned int pgsize = get_tlb_size(tlbe); - return 1ULL << 10 << pgsize; -} - -static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - u64 bytes = get_tlb_bytes(tlbe); - return get_tlb_eaddr(tlbe) + bytes - 1; -} - -static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return tlbe->mas7_3 & ~0xfffULL; -} - -static inline unsigned int -get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return (tlbe->mas1 >> 16) & 0xff; -} - -static inline unsigned int -get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return (tlbe->mas1 >> 12) & 0x1; -} - -static inline unsigned int -get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return (tlbe->mas1 >> 31) & 0x1; -} - -static inline unsigned int -get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) -{ - return (tlbe->mas1 >> 30) & 0x1; -} - -static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.pid & 0xff; -} - -static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) -{ - return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); -} - -static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) -{ - return !!(vcpu->arch.shared->msr & MSR_PR); -} - -static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.shared->mas6 >> 16) & 0xff; -} - -static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.shared->mas6 & 0x1; -} - -static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) -{ - /* - * Manual says that tlbsel has 2 bits wide. - * Since we only have two TLBs, only lower bit is used. - */ - return (vcpu->arch.shared->mas0 >> 28) & 0x1; -} - -static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.shared->mas0 & 0xfff; -} - -static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.shared->mas0 >> 16) & 0xfff; -} - -static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, - const struct kvm_book3e_206_tlb_entry *tlbe) -{ - gpa_t gpa; - - if (!get_tlb_v(tlbe)) - return 0; - - /* Does it match current guest AS? */ - /* XXX what about IS != DS? */ - if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) - return 0; - - gpa = get_tlb_raddr(tlbe); - if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) - /* Mapping is not for RAM. */ - return 0; - - return 1; -} - -#endif /* __KVM_E500_TLB_H__ */ diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c new file mode 100644 index 0000000..fe6c1de --- /dev/null +++ b/arch/powerpc/kvm/e500mc.c @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Varun Sethi, <varun.sethi@freescale.com> + * + * Description: + * This file is derived from arch/powerpc/kvm/e500.c, + * by Yu Liu <yu.liu@freescale.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/dbell.h> + +#include "booke.h" +#include "e500.h" + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type) +{ + enum ppc_dbell dbell_type; + unsigned long tag; + + switch (type) { + case INT_CLASS_NONCRIT: + dbell_type = PPC_G_DBELL; + break; + case INT_CLASS_CRIT: + dbell_type = PPC_G_DBELL_CRIT; + break; + case INT_CLASS_MC: + dbell_type = PPC_G_DBELL_MC; + break; + default: + WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type); + return; + } + + + tag = PPC_DBELL_LPID(vcpu->kvm->arch.lpid) | vcpu->vcpu_id; + mb(); + ppc_msgsnd(dbell_type, 0, tag); +} + +/* gtlbe must not be mapped by more than one host tlb entry */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned int tid, ts; + u32 val, eaddr, lpid; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + lpid = vcpu_e500->vcpu.kvm->arch.lpid; + + /* We search the host TLB to invalidate its shadow TLB entry */ + val = (tid << 16) | ts; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + mtspr(SPRN_MAS5, MAS5_SGS | lpid); + + asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + mtspr(SPRN_MAS5, 0); + /* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */ + mtspr(SPRN_MAS8, 0); + isync(); + + local_irq_restore(flags); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS5, MAS5_SGS | vcpu_e500->vcpu.kvm->arch.lpid); + asm volatile("tlbilxlpid"); + mtspr(SPRN_MAS5, 0); + local_irq_restore(flags); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + vcpu->arch.pid = pid; +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + kvmppc_booke_vcpu_load(vcpu, cpu); + + mtspr(SPRN_LPID, vcpu->kvm->arch.lpid); + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); + mtspr(SPRN_GPIR, vcpu->vcpu_id); + mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp); + mtspr(SPRN_EPLC, vcpu->arch.eplc); + mtspr(SPRN_EPSC, vcpu->arch.epsc); + + mtspr(SPRN_GIVPR, vcpu->arch.ivpr); + mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); + mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); + mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0); + mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1); + mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2); + mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3); + + mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0); + mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1); + + mtspr(SPRN_GEPR, vcpu->arch.epr); + mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); + mtspr(SPRN_GESR, vcpu->arch.shared->esr); + + if (vcpu->arch.oldpir != mfspr(SPRN_PIR)) + kvmppc_e500_tlbil_all(vcpu_e500); + + kvmppc_load_guest_fp(vcpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->arch.eplc = mfspr(SPRN_EPLC); + vcpu->arch.epsc = mfspr(SPRN_EPSC); + + vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0); + vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1); + vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2); + vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3); + + vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0); + vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1); + + vcpu->arch.epr = mfspr(SPRN_GEPR); + vcpu->arch.shared->dar = mfspr(SPRN_GDEAR); + vcpu->arch.shared->esr = mfspr(SPRN_GESR); + + vcpu->arch.oldpir = mfspr(SPRN_PIR); + + kvmppc_booke_vcpu_put(vcpu); +} + +int kvmppc_core_check_processor_compat(void) +{ + int r; + + if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0) + r = 0; + else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) + r = 0; + else + r = -ENOTSUPP; + + return r; +} + +int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ + SPRN_EPCR_DUVD; + vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP; + vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT); + vcpu->arch.epsc = vcpu->arch.eplc; + + vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu_e500->svr = mfspr(SPRN_SVR); + + vcpu->arch.cpu_type = KVM_CPU_E500MC; + + return 0; +} + +void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM | + KVM_SREGS_E_PC; + sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + + sregs->u.e.impl.fsl.features = 0; + sregs->u.e.impl.fsl.svr = vcpu_e500->svr; + sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; + sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + + kvmppc_get_sregs_e500_tlb(vcpu, sregs); + + sregs->u.e.ivor_high[3] = + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; + sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; + + kvmppc_get_sregs_ivor(vcpu, sregs); +} + +int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int ret; + + if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { + vcpu_e500->svr = sregs->u.e.impl.fsl.svr; + vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; + vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; + } + + ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); + if (ret < 0) + return ret; + + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + if (sregs->u.e.features & KVM_SREGS_E_PM) { + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = + sregs->u.e.ivor_high[3]; + } + + if (sregs->u.e.features & KVM_SREGS_E_PC) { + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = + sregs->u.e.ivor_high[4]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = + sregs->u.e.ivor_high[5]; + } + + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvmppc_vcpu_e500 *vcpu_e500; + struct kvm_vcpu *vcpu; + int err; + + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu_e500) { + err = -ENOMEM; + goto out; + } + vcpu = &vcpu_e500->vcpu; + + /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ + vcpu->arch.oldpir = 0xffffffff; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + err = kvmppc_e500_tlb_init(vcpu_e500); + if (err) + goto uninit_vcpu; + + vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!vcpu->arch.shared) + goto uninit_tlb; + + return vcpu; + +uninit_tlb: + kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_vcpu: + kvm_vcpu_uninit(vcpu); + +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +out: + return ERR_PTR(err); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + free_page((unsigned long)vcpu->arch.shared); + kvmppc_e500_tlb_uninit(vcpu_e500); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + int lpid; + + lpid = kvmppc_alloc_lpid(); + if (lpid < 0) + return lpid; + + kvm->arch.lpid = lpid; + return 0; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvmppc_free_lpid(kvm->arch.lpid); +} + +static int __init kvmppc_e500mc_init(void) +{ + int r; + + r = kvmppc_booke_init(); + if (r) + return r; + + kvmppc_init_lpid(64); + kvmppc_claim_lpid(0); /* host */ + + return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); +} + +static void __exit kvmppc_e500mc_exit(void) +{ + kvmppc_booke_exit(); +} + +module_init(kvmppc_e500mc_init); +module_exit(kvmppc_e500mc_exit); diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 968f401..f90e86d 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -23,6 +23,7 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/kvm_host.h> +#include <linux/clockchips.h> #include <asm/reg.h> #include <asm/time.h> @@ -35,7 +36,9 @@ #define OP_TRAP 3 #define OP_TRAP_64 2 +#define OP_31_XOP_TRAP 4 #define OP_31_XOP_LWZX 23 +#define OP_31_XOP_TRAP_64 68 #define OP_31_XOP_LBZX 87 #define OP_31_XOP_STWX 151 #define OP_31_XOP_STBX 215 @@ -102,8 +105,12 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) */ dec_time = vcpu->arch.dec; - dec_time *= 1000; - do_div(dec_time, tb_ticks_per_usec); + /* + * Guest timebase ticks at the same frequency as host decrementer. + * So use the host decrementer calculations for decrementer emulation. + */ + dec_time = dec_time << decrementer_clockevent.shift; + do_div(dec_time, decrementer_clockevent.mult); dec_nsec = do_div(dec_time, NSEC_PER_SEC); hrtimer_start(&vcpu->arch.dec_timer, ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); @@ -141,14 +148,13 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) { u32 inst = kvmppc_get_last_inst(vcpu); - u32 ea; - int ra; - int rb; - int rs; - int rt; - int sprn; + int ra = get_ra(inst); + int rs = get_rs(inst); + int rt = get_rt(inst); + int sprn = get_sprn(inst); enum emulation_result emulated = EMULATE_DONE; int advance = 1; + ulong spr_val = 0; /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); @@ -170,173 +176,143 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case 31: switch (get_xop(inst)) { + case OP_31_XOP_TRAP: +#ifdef CONFIG_64BIT + case OP_31_XOP_TRAP_64: +#endif +#ifdef CONFIG_PPC_BOOK3S + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); +#else + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); +#endif + advance = 0; + break; case OP_31_XOP_LWZX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; case OP_31_XOP_LBZX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; case OP_31_XOP_LBZUX: - rt = get_rt(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - kvmppc_set_gpr(vcpu, ra, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STWX: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_31_XOP_STBX: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_31_XOP_STBUX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); - kvmppc_set_gpr(vcpu, rs, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_LHAX: - rt = get_rt(inst); emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; case OP_31_XOP_LHZX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; case OP_31_XOP_LHZUX: - rt = get_rt(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - kvmppc_set_gpr(vcpu, ra, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_MFSPR: - sprn = get_sprn(inst); - rt = get_rt(inst); - switch (sprn) { case SPRN_SRR0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0); + spr_val = vcpu->arch.shared->srr0; break; case SPRN_SRR1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1); + spr_val = vcpu->arch.shared->srr1; break; case SPRN_PVR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; + spr_val = vcpu->arch.pvr; + break; case SPRN_PIR: - kvmppc_set_gpr(vcpu, rt, vcpu->vcpu_id); break; + spr_val = vcpu->vcpu_id; + break; case SPRN_MSSSR0: - kvmppc_set_gpr(vcpu, rt, 0); break; + spr_val = 0; + break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. * In fact, we probably will never see these traps. */ case SPRN_TBWL: - kvmppc_set_gpr(vcpu, rt, get_tb() >> 32); break; + spr_val = get_tb() >> 32; + break; case SPRN_TBWU: - kvmppc_set_gpr(vcpu, rt, get_tb()); break; + spr_val = get_tb(); + break; case SPRN_SPRG0: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0); + spr_val = vcpu->arch.shared->sprg0; break; case SPRN_SPRG1: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1); + spr_val = vcpu->arch.shared->sprg1; break; case SPRN_SPRG2: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2); + spr_val = vcpu->arch.shared->sprg2; break; case SPRN_SPRG3: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3); + spr_val = vcpu->arch.shared->sprg3; break; /* Note: SPRG4-7 are user-readable, so we don't get * a trap. */ case SPRN_DEC: - { - kvmppc_set_gpr(vcpu, rt, - kvmppc_get_dec(vcpu, get_tb())); + spr_val = kvmppc_get_dec(vcpu, get_tb()); break; - } default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); - if (emulated == EMULATE_FAIL) { - printk("mfspr: unknown spr %x\n", sprn); - kvmppc_set_gpr(vcpu, rt, 0); + emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, + &spr_val); + if (unlikely(emulated == EMULATE_FAIL)) { + printk(KERN_INFO "mfspr: unknown spr " + "0x%x\n", sprn); } break; } + kvmppc_set_gpr(vcpu, rt, spr_val); kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); break; case OP_31_XOP_STHX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_31_XOP_STHUX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - - ea = kvmppc_get_gpr(vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(vcpu, ra); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); - kvmppc_set_gpr(vcpu, ra, ea); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_MTSPR: - sprn = get_sprn(inst); - rs = get_rs(inst); + spr_val = kvmppc_get_gpr(vcpu, rs); switch (sprn) { case SPRN_SRR0: - vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.shared->srr0 = spr_val; break; case SPRN_SRR1: - vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.shared->srr1 = spr_val; break; /* XXX We need to context-switch the timebase for @@ -347,27 +323,29 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_MSSSR0: break; case SPRN_DEC: - vcpu->arch.dec = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.dec = spr_val; kvmppc_emulate_dec(vcpu); break; case SPRN_SPRG0: - vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.shared->sprg0 = spr_val; break; case SPRN_SPRG1: - vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.shared->sprg1 = spr_val; break; case SPRN_SPRG2: - vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.shared->sprg2 = spr_val; break; case SPRN_SPRG3: - vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs); + vcpu->arch.shared->sprg3 = spr_val; break; default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); + emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, + spr_val); if (emulated == EMULATE_FAIL) - printk("mtspr: unknown spr %x\n", sprn); + printk(KERN_INFO "mtspr: unknown spr " + "0x%x\n", sprn); break; } kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); @@ -382,7 +360,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_LWBRX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0); break; @@ -390,25 +367,16 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_STWBRX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 0); break; case OP_31_XOP_LHBRX: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0); break; case OP_31_XOP_STHBRX: - rs = get_rs(inst); - ra = get_ra(inst); - rb = get_rb(inst); - emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 0); @@ -421,99 +389,78 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_LWZ: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; case OP_LWZU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_LBZ: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; case OP_LBZU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STW: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 1); break; case OP_STWU: - ra = get_ra(inst); - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 4, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STB: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: - ra = get_ra(inst); - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 1, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_LHZ: - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; case OP_LHZU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_LHA: - rt = get_rt(inst); emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; case OP_LHAU: - ra = get_ra(inst); - rt = get_rt(inst); emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STH: - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: - ra = get_ra(inst); - rs = get_rs(inst); emulated = kvmppc_handle_store(run, vcpu, kvmppc_get_gpr(vcpu, rs), 2, 1); - kvmppc_set_gpr(vcpu, ra, vcpu->arch.paddr_accessed); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; default: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 00d7e34..1493c8d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -43,6 +43,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) v->requests; } +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return 1; +} + int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { int nr = kvmppc_get_gpr(vcpu, 11); @@ -74,7 +79,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) } case HC_VENDOR_KVM | KVM_HC_FEATURES: r = HC_EV_SUCCESS; -#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500) +#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) /* XXX Missing magic page on 44x */ r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); #endif @@ -109,6 +114,11 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) goto out; #endif +#ifdef CONFIG_KVM_BOOKE_HV + if (!cpu_has_feature(CPU_FTR_EMB_HV)) + goto out; +#endif + r = true; out: @@ -225,7 +235,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: -#ifdef CONFIG_KVM_E500 +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: #endif r = 1; @@ -234,10 +244,12 @@ int kvm_dev_ioctl_check_extension(long ext) r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; #endif -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: r = 1; break; +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_SMT: r = threads_per_core; break; @@ -267,6 +279,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; +#ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CAP_PPC_GET_SMMU_INFO: + r = 1; + break; +#endif default: r = 0; break; @@ -588,21 +605,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return r; } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int me; - int cpu = vcpu->cpu; - - me = get_cpu(); - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } else if (cpu != me && cpu != -1) { - smp_send_reschedule(vcpu->cpu); - } - put_cpu(); -} - int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { @@ -611,6 +613,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) } kvmppc_core_queue_external(vcpu, irq); + kvm_vcpu_kick(vcpu); return 0; @@ -633,7 +636,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; -#ifdef CONFIG_KVM_E500 +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: { struct kvm_config_tlb cfg; void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; @@ -710,7 +713,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } -#ifdef CONFIG_KVM_E500 +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_DIRTY_TLB: { struct kvm_dirty_tlb dirty; r = -EFAULT; @@ -720,7 +723,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } #endif - default: r = -EINVAL; } @@ -777,7 +779,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_PPC_BOOK3S_64 case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; struct kvm *kvm = filp->private_data; @@ -788,7 +790,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); goto out; } +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_KVM_BOOK3S_64_HV case KVM_ALLOCATE_RMA: { struct kvm *kvm = filp->private_data; struct kvm_allocate_rma rma; @@ -800,6 +804,18 @@ long kvm_arch_vm_ioctl(struct file *filp, } #endif /* CONFIG_KVM_BOOK3S_64_HV */ +#ifdef CONFIG_PPC_BOOK3S_64 + case KVM_PPC_GET_SMMU_INFO: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_smmu_info info; + + memset(&info, 0, sizeof(info)); + r = kvm_vm_ioctl_get_smmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = -EFAULT; + break; + } +#endif /* CONFIG_PPC_BOOK3S_64 */ default: r = -ENOTTY; } @@ -808,6 +824,40 @@ out: return r; } +static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)]; +static unsigned long nr_lpids; + +long kvmppc_alloc_lpid(void) +{ + long lpid; + + do { + lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS); + if (lpid >= nr_lpids) { + pr_err("%s: No LPIDs free\n", __func__); + return -ENOMEM; + } + } while (test_and_set_bit(lpid, lpid_inuse)); + + return lpid; +} + +void kvmppc_claim_lpid(long lpid) +{ + set_bit(lpid, lpid_inuse); +} + +void kvmppc_free_lpid(long lpid) +{ + clear_bit(lpid, lpid_inuse); +} + +void kvmppc_init_lpid(unsigned long nr_lpids_param) +{ + nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); + memset(lpid_inuse, 0, sizeof(lpid_inuse)); +} + int kvm_arch_init(void *opaque) { return 0; diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index 8167d42..bf191e7 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) case SIGNAL_EXITS: vcpu->stat.signal_exits++; break; + case DBELL_EXITS: + vcpu->stat.dbell_exits++; + break; + case GDBELL_EXITS: + vcpu->stat.gdbell_exits++; + break; } } diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h index 9607667..bdcbe0f 100644 --- a/arch/s390/include/asm/kvm.h +++ b/arch/s390/include/asm/kvm.h @@ -52,4 +52,9 @@ struct kvm_sync_regs { __u32 acrs[16]; /* access registers */ __u64 crs[16]; /* control registers */ }; + +#define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) +#define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2) +#define KVM_REG_S390_CPU_TIMER (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3) +#define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4) #endif diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 7343872..dd17537 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -148,6 +148,7 @@ struct kvm_vcpu_stat { u32 instruction_sigp_restart; u32 diagnose_10; u32 diagnose_44; + u32 diagnose_9c; }; struct kvm_s390_io_info { diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index 6964db2..a988329 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -149,6 +149,11 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif #endif /* __S390_KVM_PARA_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index fed7bee..bf238c5 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -48,6 +48,7 @@ int sclp_cpu_deconfigure(u8 cpu); void sclp_facilities_detect(void); unsigned long long sclp_get_rnmax(void); unsigned long long sclp_get_rzm(void); +u8 sclp_get_fac85(void); int sclp_sdias_blk_count(void); int sclp_sdias_copy(void *dest, int blk_num, int nr_blks); int sclp_chp_configure(struct chp_id chpid); diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index a353f0e..b23d9ac 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -47,9 +47,30 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); vcpu->stat.diagnose_44++; - vcpu_put(vcpu); - yield(); - vcpu_load(vcpu); + kvm_vcpu_on_spin(vcpu); + return 0; +} + +static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tcpu; + int tid; + int i; + + tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; + vcpu->stat.diagnose_9c++; + VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d", tid); + + if (tid == vcpu->vcpu_id) + return 0; + + kvm_for_each_vcpu(i, tcpu, kvm) + if (tcpu->vcpu_id == tid) { + kvm_vcpu_yield_to(tcpu); + break; + } + return 0; } @@ -89,6 +110,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) return diag_release_pages(vcpu); case 0x44: return __diag_time_slice_end(vcpu); + case 0x9c: + return __diag_time_slice_end_directed(vcpu); case 0x308: return __diag_ipl_functions(vcpu); default: diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 3614565..979cbe5 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -101,6 +101,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu) } static intercept_handler_t instruction_handlers[256] = { + [0x01] = kvm_s390_handle_01, [0x83] = kvm_s390_handle_diag, [0xae] = kvm_s390_handle_sigp, [0xb2] = kvm_s390_handle_b2, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 217ce44..664766d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -28,6 +28,7 @@ #include <asm/pgtable.h> #include <asm/nmi.h> #include <asm/switch_to.h> +#include <asm/sclp.h> #include "kvm-s390.h" #include "gaccess.h" @@ -74,6 +75,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, { "diagnose_10", VCPU_STAT(diagnose_10) }, { "diagnose_44", VCPU_STAT(diagnose_44) }, + { "diagnose_9c", VCPU_STAT(diagnose_9c) }, { NULL } }; @@ -133,8 +135,16 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_S390_UCONTROL: #endif case KVM_CAP_SYNC_REGS: + case KVM_CAP_ONE_REG: r = 1; break; + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_S390_COW: + r = sclp_get_fac85() & 0x2; + break; default: r = 0; } @@ -423,6 +433,71 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + /* kvm common code refers to this, but never calls it */ + BUG(); + return 0; +} + +static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_S390_TODPR: + r = put_user(vcpu->arch.sie_block->todpr, + (u32 __user *)reg->addr); + break; + case KVM_REG_S390_EPOCHDIFF: + r = put_user(vcpu->arch.sie_block->epoch, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CPU_TIMER: + r = put_user(vcpu->arch.sie_block->cputm, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CLOCK_COMP: + r = put_user(vcpu->arch.sie_block->ckc, + (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, + struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_S390_TODPR: + r = get_user(vcpu->arch.sie_block->todpr, + (u32 __user *)reg->addr); + break; + case KVM_REG_S390_EPOCHDIFF: + r = get_user(vcpu->arch.sie_block->epoch, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CPU_TIMER: + r = get_user(vcpu->arch.sie_block->cputm, + (u64 __user *)reg->addr); + break; + case KVM_REG_S390_CLOCK_COMP: + r = get_user(vcpu->arch.sie_block->ckc, + (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) { kvm_s390_vcpu_initial_reset(vcpu); @@ -753,6 +828,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_S390_INITIAL_RESET: r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); break; + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + break; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_arch_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_arch_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } #ifdef CONFIG_KVM_S390_UCONTROL case KVM_S390_UCAS_MAP: { struct kvm_s390_ucas_mapping ucasmap; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ff28f9d..2294377 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,6 +79,7 @@ int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); /* implemented in priv.c */ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); +int kvm_s390_handle_01(struct kvm_vcpu *vcpu); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e5a45db..68a6b2e 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -380,3 +380,34 @@ int kvm_s390_handle_e5(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +static int handle_sckpf(struct kvm_vcpu *vcpu) +{ + u32 value; + + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, + PGM_PRIVILEGED_OPERATION); + + if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000) + return kvm_s390_inject_program_int(vcpu, + PGM_SPECIFICATION); + + value = vcpu->run->s.regs.gprs[0] & 0x000000000000ffff; + vcpu->arch.sie_block->todpr = value; + + return 0; +} + +static intercept_handler_t x01_handlers[256] = { + [0x07] = handle_sckpf, +}; + +int kvm_s390_handle_01(struct kvm_vcpu *vcpu) +{ + intercept_handler_t handler; + + handler = x01_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; + if (handler) + return handler(vcpu); + return -EOPNOTSUPP; +} diff --git a/arch/score/include/asm/kvm_para.h b/arch/score/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/score/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/sh/include/asm/kvm_para.h b/arch/sh/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/sh/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/sparc/include/asm/kvm_para.h b/arch/sparc/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/sparc/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/tile/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/um/include/asm/kvm_para.h b/arch/um/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/um/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/unicore32/include/asm/kvm_para.h b/arch/unicore32/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/unicore32/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c222e1a..1ac46c22 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -200,7 +200,7 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -213,12 +213,14 @@ struct operand { unsigned seg; } mem; unsigned xmm; + unsigned mm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; sse128_t vec_val; + u64 mm_val; }; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e5b97be..db7c1f2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -173,6 +173,9 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff23ff +/* apic attention bits */ +#define KVM_APIC_CHECK_VAPIC 0 + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -238,8 +241,6 @@ struct kvm_mmu_page { #endif int write_flooding_count; - - struct rcu_head rcu; }; struct kvm_pio_request { @@ -338,6 +339,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; int sipi_vector; @@ -537,8 +539,6 @@ struct kvm_arch { u64 hv_guest_os_id; u64 hv_hypercall; - atomic_t reader_counter; - #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif @@ -713,8 +713,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 183922e..63ab166 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -95,6 +95,14 @@ struct kvm_vcpu_pv_apf_data { extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); +#ifdef CONFIG_KVM_CLOCK +bool kvm_check_and_clear_guest_paused(void); +#else +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} +#endif /* CONFIG_KVMCLOCK */ /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. @@ -173,14 +181,16 @@ static inline int kvm_para_available(void) if (boot_cpu_data.cpuid_level < 0) return 0; /* So we don't blow up on old processors */ - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + if (cpu_has_hypervisor) { + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; - if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; + if (strcmp(signature, "KVMKVMKVM") == 0) + return 1; + } return 0; } diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 35f2d19..6167fd7 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -40,5 +40,6 @@ struct pvclock_wall_clock { } __attribute__((__packed__)); #define PVCLOCK_TSC_STABLE_BIT (1 << 0) +#define PVCLOCK_GUEST_STOPPED (1 << 1) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f8492da6..086eb58 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,7 @@ #include <asm/msr.h> #include <asm/apic.h> #include <linux/percpu.h> +#include <linux/hardirq.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -114,6 +115,25 @@ static void kvm_get_preset_lpj(void) preset_lpj = lpj; } +bool kvm_check_and_clear_guest_paused(void) +{ + bool ret = false; + struct pvclock_vcpu_time_info *src; + + /* + * per_cpu() is safe here because this function is only called from + * timer functions where preemption is already disabled. + */ + WARN_ON(!in_atomic()); + src = &__get_cpu_var(hv_clock); + if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { + __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + ret = true; + } + + return ret; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 1a7fe86..a28f338 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select TASKSTATS select TASK_DELAY_ACCT select PERF_EVENTS + select HAVE_KVM_MSI ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9fed5be..7df1c6d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | F(RTM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -397,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; - entry->eax = 0; + entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; entry->edx = sigptr[2]; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8375622..f95d242 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -142,6 +142,10 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +#define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ +#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -557,6 +561,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } +/* + * x86 defines three classes of vector instructions: explicitly + * aligned, explicitly unaligned, and the rest, which change behaviour + * depending on whether they're AVX encoded or not. + * + * Also included is CMPXCHG16B which is not a vector instruction, yet it is + * subject to the same check. + */ +static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +{ + if (likely(size < 16)) + return false; + + if (ctxt->d & Aligned) + return true; + else if (ctxt->d & Unaligned) + return false; + else if (ctxt->d & Avx) + return false; + else + return true; +} + static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, bool fetch, @@ -621,6 +648,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, } if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) la &= (u32)-1; + if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + return emulate_gp(ctxt, 0); *linear = la; return X86EMUL_CONTINUE; bad: @@ -859,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, ctxt->ops->put_fpu(ctxt); } +static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; + case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; + case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; + case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; + case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; + case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; + case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; + case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; + case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; + case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; + case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -875,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, reg); return; } + if (ctxt->d & Mmx) { + reg &= 7; + op->type = OP_MM; + op->bytes = 8; + op->addr.mm = reg; + return; + } op->type = OP_REG; if (ctxt->d & ByteOp) { @@ -902,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -920,6 +989,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); return rc; } + if (ctxt->d & Mmx) { + op->type = OP_MM; + op->bytes = 8; + op->addr.xmm = ctxt->modrm_rm & 7; + return rc; + } fetch_register_operand(op); return rc; } @@ -1387,6 +1462,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt) case OP_XMM: write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); break; + case OP_MM: + write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); + break; case OP_NONE: /* no writeback */ break; @@ -2790,7 +2868,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.val = ctxt->src.val; + memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); return X86EMUL_CONTINUE; } @@ -2870,12 +2948,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } -static int em_movdqu(struct x86_emulate_ctxt *ctxt) -{ - memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); - return X86EMUL_CONTINUE; -} - static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3061,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt) static int em_bsf(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } + emulate_2op_SrcV_nobyte(ctxt, "bsf"); return X86EMUL_CONTINUE; } static int em_bsr(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } + emulate_2op_SrcV_nobyte(ctxt, "bsr"); return X86EMUL_CONTINUE; } @@ -3286,8 +3336,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) .check_perm = (_p) } #define N D(0) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } +#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } @@ -3307,25 +3357,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static struct opcode group7_rm1[] = { - DI(SrcNone | ModRM | Priv, monitor), - DI(SrcNone | ModRM | Priv, mwait), + DI(SrcNone | Priv, monitor), + DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; static struct opcode group7_rm3[] = { - DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), - DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), + DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), + DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | Prot | Priv, stgi, check_svme), + DIP(SrcNone | Prot | Priv, clgi, check_svme), + DIP(SrcNone | Prot | Priv, skinit, check_svme), + DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; static struct opcode group7_rm7[] = { N, - DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; @@ -3341,81 +3391,86 @@ static struct opcode group1[] = { }; static struct opcode group1A[] = { - I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; static struct opcode group3[] = { - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcNone | ModRM | Lock, em_not), - I(DstMem | SrcNone | ModRM | Lock, em_neg), - I(SrcMem | ModRM, em_mul_ex), - I(SrcMem | ModRM, em_imul_ex), - I(SrcMem | ModRM, em_div_ex), - I(SrcMem | ModRM, em_idiv_ex), + I(DstMem | SrcImm, em_test), + I(DstMem | SrcImm, em_test), + I(DstMem | SrcNone | Lock, em_not), + I(DstMem | SrcNone | Lock, em_neg), + I(SrcMem, em_mul_ex), + I(SrcMem, em_imul_ex), + I(SrcMem, em_div_ex), + I(SrcMem, em_idiv_ex), }; static struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | Lock, em_grp45), N, N, N, N, N, N, }; static struct opcode group5[] = { - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), N, + I(DstMem | SrcNone | Lock, em_grp45), + I(DstMem | SrcNone | Lock, em_grp45), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps, em_grp45), + I(SrcMem | Stack, em_grp45), N, }; static struct opcode group6[] = { - DI(ModRM | Prot, sldt), - DI(ModRM | Prot, str), - DI(ModRM | Prot | Priv, lldt), - DI(ModRM | Prot | Priv, ltr), + DI(Prot, sldt), + DI(Prot, str), + DI(Prot | Priv, lldt), + DI(Prot | Priv, ltr), N, N, N, N, }; static struct group_dual group7 = { { - DI(ModRM | Mov | DstMem | Priv, sgdt), - DI(ModRM | Mov | DstMem | Priv, sidt), - II(ModRM | SrcMem | Priv, em_lgdt, lgdt), - II(ModRM | SrcMem | Priv, em_lidt, lidt), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), - II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), + DI(Mov | DstMem | Priv, sgdt), + DI(Mov | DstMem | Priv, sidt), + II(SrcMem | Priv, em_lgdt, lgdt), + II(SrcMem | Priv, em_lidt, lidt), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | VendorSpecific, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + EXT(0, group7_rm7), } }; static struct opcode group8[] = { N, N, N, N, - I(DstMem | SrcImmByte | ModRM, em_bt), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | ModRM | Lock, em_btr), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), + I(DstMem | SrcImmByte, em_bt), + I(DstMem | SrcImmByte | Lock | PageTable, em_bts), + I(DstMem | SrcImmByte | Lock, em_btr), + I(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; static struct group_dual group9 = { { - N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, + N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), + I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; static struct gprefix pfx_0f_6f_0f_7f = { - N, N, N, I(Sse, em_movdqu), + I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), +}; + +static struct gprefix pfx_vmovntpx = { + I(0, em_mov), N, N, N, }; static struct opcode opcode_table[256] = { @@ -3464,10 +3519,10 @@ static struct opcode opcode_table[256] = { /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm, group1), + G(DstMem | SrcImm, group1), + G(ByteOp | DstMem | SrcImm | No64, group1), + G(DstMem | SrcImmByte, group1), I2bv(DstMem | SrcReg | ModRM, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ @@ -3549,7 +3604,8 @@ static struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, N, N, N, N, N, + N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), @@ -3897,17 +3953,16 @@ done_prefixes: } ctxt->d = opcode.flags; + if (ctxt->d & ModRM) + ctxt->modrm = insn_fetch(u8, ctxt); + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; @@ -3960,6 +4015,8 @@ done_prefixes: if (ctxt->d & Sse) ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { @@ -4030,6 +4087,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) return false; } +static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) +{ + bool fault = false; + + ctxt->ops->get_fpu(ctxt); + asm volatile("1: fwait \n\t" + "2: \n\t" + ".pushsection .fixup,\"ax\" \n\t" + "3: \n\t" + "movb $1, %[fault] \n\t" + "jmp 2b \n\t" + ".popsection \n\t" + _ASM_EXTABLE(1b, 3b) + : [fault]"+qm"(fault)); + ctxt->ops->put_fpu(ctxt); + + if (unlikely(fault)) + return emulate_exception(ctxt, MF_VECTOR, 0, false); + + return X86EMUL_CONTINUE; +} + +static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op) +{ + if (op->type == OP_MM) + read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -4054,18 +4140,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if ((ctxt->d & Sse) - && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) - || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d68f99d..adba28f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,7 +34,6 @@ #include <linux/kvm_host.h> #include <linux/slab.h> -#include <linux/workqueue.h> #include "irq.h" #include "i8254.h" @@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. Reinject */ - queue_work(ps->pit->wq, &ps->pit->expired); + queue_kthread_work(&ps->pit->worker, &ps->pit->expired); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } @@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_pit *pit) { hrtimer_cancel(&pit->pit_state.pit_timer.timer); - cancel_work_sync(&pit->expired); + flush_kthread_work(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; -static void pit_do_work(struct work_struct *work) +static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; @@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); - queue_work(pt->wq, &pt->expired); + queue_kthread_work(&pt->worker, &pt->expired); } if (ktimer->t_ops->is_periodic(ktimer)) { @@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); - cancel_work_sync(&ps->pit->expired); + flush_kthread_work(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; @@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + struct pid *pid; + pid_t pid_nr; int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); @@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - pit->wq = create_singlethread_workqueue("kvm-pit-wq"); - if (!pit->wq) { + pid = get_pid(task_tgid(current)); + pid_nr = pid_vnr(pid); + put_pid(pid); + + init_kthread_worker(&pit->worker); + pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, + "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker_task)) { mutex_unlock(&pit->pit_state.lock); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; } - INIT_WORK(&pit->expired, pit_do_work); + init_kthread_work(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -736,7 +743,7 @@ fail: kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); kvm_free_irq_source_id(kvm, pit->irq_source_id); - destroy_workqueue(pit->wq); + kthread_stop(pit->worker_task); kfree(pit); return NULL; } @@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); - cancel_work_sync(&kvm->arch.vpit->expired); + flush_kthread_work(&kvm->arch.vpit->expired); + kthread_stop(kvm->arch.vpit->worker_task); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 51a9742..fdf4042 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -1,6 +1,8 @@ #ifndef __I8254_H #define __I8254_H +#include <linux/kthread.h> + #include "iodev.h" struct kvm_kpit_channel_state { @@ -39,8 +41,9 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct workqueue_struct *wq; - struct work_struct expired; + struct kthread_worker worker; + struct task_struct *worker_task; + struct kthread_work expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8584322..93c1574 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap) return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int apic_test_vector(int vec, void *bitmap) +{ + return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline void apic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - int trigger_mode; /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_clear_vector(vector, apic->regs + APIC_ISR); apic_update_ppr(apic); - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } @@ -1081,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; + vcpu->arch.apic_attention = 0; apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, @@ -1280,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) u32 data; void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; vapic = kmap_atomic(vcpu->arch.apic->vapic_page); @@ -1297,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) struct kvm_lapic *apic; void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; apic = vcpu->arch.apic; @@ -1317,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { - if (!irqchip_in_kernel(vcpu->kvm)) - return; - vcpu->arch.apic->vapic_addr = vapic_addr; + if (vapic_addr) + __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + else + __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4cb1642..72102e0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -135,8 +135,6 @@ module_param(dbg, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PTE_LIST_EXT 4 - #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK @@ -151,6 +149,9 @@ module_param(dbg, bool, 0644); #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* make pte_list_desc fit well in cache line */ +#define PTE_LIST_EXT 3 + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -550,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - rcu_read_lock(); - atomic_inc(&vcpu->kvm->arch.reader_counter); - - /* Increase the counter before walking shadow page table */ - smp_mb__after_atomic_inc(); + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); + vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_mb(); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* Decrease the counter after walking shadow page table finished */ - smp_mb__before_atomic_dec(); - atomic_dec(&vcpu->kvm->arch.reader_counter); - rcu_read_unlock(); + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_mb(); + vcpu->mode = OUTSIDE_GUEST_MODE; + local_irq_enable(); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -841,32 +852,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, return count; } -static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) -{ - struct pte_list_desc *desc; - u64 *prev_spte; - int i; - - if (!*pte_list) - return NULL; - else if (!(*pte_list & 1)) { - if (!spte) - return (u64 *)*pte_list; - return NULL; - } - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - prev_spte = NULL; - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { - if (prev_spte == spte) - return desc->sptes[i]; - prev_spte = desc->sptes[i]; - } - desc = desc->more; - } - return NULL; -} - static void pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) @@ -987,11 +972,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static u64 *rmap_next(unsigned long *rmapp, u64 *spte) -{ - return pte_list_next(rmapp, spte); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; @@ -1004,106 +984,201 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pte_list_remove(spte, rmapp); } +/* + * Used by the following functions to iterate through the sptes linked by a + * rmap. All fields are private and not assumed to be used outside. + */ +struct rmap_iterator { + /* private fields */ + struct pte_list_desc *desc; /* holds the sptep if not NULL */ + int pos; /* index of the sptep */ +}; + +/* + * Iteration must be started by this function. This should also be used after + * removing/dropping sptes from the rmap link because in such cases the + * information in the itererator may not be valid. + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +{ + if (!rmap) + return NULL; + + if (!(rmap & 1)) { + iter->desc = NULL; + return (u64 *)rmap; + } + + iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->pos = 0; + return iter->desc->sptes[iter->pos]; +} + +/* + * Must be used with a valid iterator: e.g. after rmap_get_first(). + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_next(struct rmap_iterator *iter) +{ + if (iter->desc) { + if (iter->pos < PTE_LIST_EXT - 1) { + u64 *sptep; + + ++iter->pos; + sptep = iter->desc->sptes[iter->pos]; + if (sptep) + return sptep; + } + + iter->desc = iter->desc->more; + + if (iter->desc) { + iter->pos = 0; + /* desc->sptes[0] cannot be NULL */ + return iter->desc->sptes[iter->pos]; + } + } + + return NULL; +} + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) rmap_remove(kvm, sptep); } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { - unsigned long *rmapp; - u64 *spte; - int i, write_protected = 0; - - rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writable_pte(*spte)) { - mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; + u64 *sptep; + struct rmap_iterator iter; + int write_protected = 0; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + + if (!is_writable_pte(*sptep)) { + sptep = rmap_get_next(&iter); + continue; } - spte = rmap_next(rmapp, spte); - } - /* check for huge page mappings */ - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!is_large_pte(*spte)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writable_pte(*spte)) { - drop_spte(kvm, spte); - --kvm->stat.lpages; - spte = NULL; - write_protected = 1; - } - spte = rmap_next(rmapp, spte); + if (level == PT_PAGE_TABLE_LEVEL) { + mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); + sptep = rmap_get_next(&iter); + } else { + BUG_ON(!is_large_pte(*sptep)); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + sptep = rmap_get_first(*rmapp, &iter); } + + write_protected = 1; } return write_protected; } +/** + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages + * @kvm: kvm instance + * @slot: slot to protect + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should protect + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); + + /* clear the first set bit */ + mask &= mask - 1; + } +} + static int rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; + unsigned long *rmapp; + int i; + int write_protected = 0; slot = gfn_to_memslot(kvm, gfn); - return kvm_mmu_rmap_write_protect(kvm, gfn, slot); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmapp, i); + } + + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int need_tlb_flush = 0; - while ((spte = rmap_next(rmapp, NULL))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte); + while ((sptep = rmap_get_first(*rmapp, &iter))) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + + drop_spte(kvm, sptep); need_tlb_flush = 1; } + return need_tlb_flush; } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { + u64 *sptep; + struct rmap_iterator iter; int need_flush = 0; - u64 *spte, new_spte; + u64 new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!is_shadow_present_pte(*spte)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!is_shadow_present_pte(*sptep)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + need_flush = 1; + if (pte_write(*ptep)) { - drop_spte(kvm, spte); - spte = rmap_next(rmapp, NULL); + drop_spte(kvm, sptep); + sptep = rmap_get_first(*rmapp, &iter); } else { - new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - mmu_spte_clear_track_bits(spte); - mmu_spte_set(spte, new_spte); - spte = rmap_next(rmapp, spte); + + mmu_spte_clear_track_bits(sptep); + mmu_spte_set(sptep, new_spte); + sptep = rmap_get_next(&iter); } } + if (need_flush) kvm_flush_remote_tlbs(kvm); @@ -1162,7 +1237,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1175,25 +1251,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) return kvm_unmap_rmapp(kvm, rmapp, data); - spte = rmap_next(rmapp, NULL); - while (spte) { - int _young; - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - _young = _spte & PT_ACCESSED_MASK; - if (_young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + if (*sptep & PT_ACCESSED_MASK) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); } - spte = rmap_next(rmapp, spte); } + return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1204,16 +1279,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(rmapp, NULL); - while (spte) { - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - young = _spte & PT_ACCESSED_MASK; - if (young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + if (*sptep & PT_ACCESSED_MASK) { young = 1; break; } - spte = rmap_next(rmapp, spte); } out: return young; @@ -1865,10 +1938,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { - u64 *parent_pte; + u64 *sptep; + struct rmap_iterator iter; - while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) - drop_parent_pte(sp, parent_pte); + while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + drop_parent_pte(sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -1925,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, return ret; } -static void kvm_mmu_isolate_pages(struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, invalid_list, link) - kvm_mmu_isolate_page(sp); -} - -static void free_pages_rcu(struct rcu_head *head) -{ - struct kvm_mmu_page *next, *sp; - - sp = container_of(head, struct kvm_mmu_page, rcu); - while (sp) { - if (!list_empty(&sp->link)) - next = list_first_entry(&sp->link, - struct kvm_mmu_page, link); - else - next = NULL; - kvm_mmu_free_page(sp); - sp = next; - } -} - static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { @@ -1957,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if (list_empty(invalid_list)) return; - kvm_flush_remote_tlbs(kvm); - - if (atomic_read(&kvm->arch.reader_counter)) { - kvm_mmu_isolate_pages(invalid_list); - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - list_del_init(invalid_list); + /* + * wmb: make sure everyone sees our modifications to the page tables + * rmb: make sure we see changes to vcpu->mode + */ + smp_mb(); - trace_kvm_mmu_delay_free_pages(sp); - call_rcu(&sp->rcu, free_pages_rcu); - return; - } + /* + * Wait for all vcpus to exit guest mode and/or lockless shadow + * page table walks. + */ + kvm_flush_remote_tlbs(kvm); do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); @@ -1975,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); - } /* @@ -3554,7 +3603,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == 1) + if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; return ++sp->write_flooding_count >= 3; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 715da5a..7d7d0b9 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; unsigned long *rmapp; - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; if (sp->role.direct || sp->unsync || sp->role.invalid) return; @@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slot = gfn_to_memslot(kvm, sp->gfn); rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(rmapp, spte); } } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df5a703..34f9709 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != 1); + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e334389..f75af40 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -22,6 +22,7 @@ #include "x86.h" #include <linux/module.h> +#include <linux/mod_devicetable.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -42,6 +43,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id svm_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_SVM), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); + #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 @@ -3240,6 +3247,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); + ++svm->vcpu.stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -3247,7 +3255,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) if (!irqchip_in_kernel(svm->vcpu.kvm) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { - ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ff0ab9..32eb588 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/moduleparam.h> +#include <linux/mod_devicetable.h> #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> @@ -51,6 +52,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_VMX), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); @@ -386,6 +393,9 @@ struct vcpu_vmx { struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif int gs_ldt_reload_needed; int fs_reload_needed; } host_state; @@ -1411,6 +1421,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) } #ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); +#endif + +#ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); #else @@ -1450,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) } if (vmx->host_state.fs_reload_needed) loadsegment(fs, vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { + loadsegment(ds, vmx->host_state.ds_sel); + loadsegment(es, vmx->host_state.es_sel); + } +#else + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif reload_tss(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); @@ -3633,8 +3661,18 @@ static void vmx_set_constant_host_state(void) vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + /* + * Load null selectors, so we can avoid reloading them in + * __vmx_load_host_state(), in case userspace uses the null selectors + * too (the expected case). + */ + vmcs_write16(HOST_DS_SELECTOR, 0); + vmcs_write16(HOST_ES_SELECTOR, 0); +#else vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#endif vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ @@ -6256,7 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } } - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); @@ -6343,7 +6380,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vmcs(vmx->loaded_vmcs->vmcs); + free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185a2b8..be6d549 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: + case KVM_CAP_KVMCLOCK_CTRL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, return r; } +/* + * kvm_set_guest_paused() indicates to the guest kernel that it has been + * stopped by the hypervisor. This function will be called from the host only. + * EINVAL is returned when the host attempts to set the flag for a guest that + * does not support pv clocks. + */ +static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; + if (!vcpu->arch.time_page) + return -EINVAL; + src->flags |= PVCLOCK_GUEST_STOPPED; + mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = vcpu->arch.virtual_tsc_khz; goto out; } + case KVM_KVMCLOCK_CTRL: { + r = kvm_set_guest_paused(vcpu); + goto out; + } default: r = -EINVAL; } @@ -3045,57 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, } /** - * write_protect_slot - write protect a slot for dirty logging - * @kvm: the kvm instance - * @memslot: the slot we protect - * @dirty_bitmap: the bitmap indicating which pages are dirty - * @nr_dirty_pages: the number of dirty pages + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log * - * We have two ways to find all sptes to protect: - * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and - * checks ones that have a spte mapping a page in the slot. - * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: * - * Generally speaking, if there are not so many dirty pages compared to the - * number of shadow pages, we should use the latter. + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. * - * Note that letting others write into a page marked dirty in the old bitmap - * by using the remaining tlb entry is not a problem. That page will become - * write protected again when we flush the tlb and then be reported dirty to - * the user space by copying the old bitmap. - */ -static void write_protect_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, - unsigned long *dirty_bitmap, - unsigned long nr_dirty_pages) -{ - spin_lock(&kvm->mmu_lock); - - /* Not many dirty pages compared to # of shadow pages. */ - if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - unsigned long gfn_offset; - - for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { - unsigned long gfn = memslot->base_gfn + gfn_offset; - - kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - } - kvm_flush_remote_tlbs(kvm); - } else - kvm_mmu_slot_remove_write_access(kvm, memslot->id); - - spin_unlock(&kvm->mmu_lock); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; struct kvm_memory_slot *memslot; - unsigned long n, nr_dirty_pages; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; mutex_lock(&kvm->slots_lock); @@ -3104,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, goto out; memslot = id_to_memslot(kvm->memslots, log->slot); + + dirty_bitmap = memslot->dirty_bitmap; r = -ENOENT; - if (!memslot->dirty_bitmap) + if (!dirty_bitmap) goto out; n = kvm_dirty_bitmap_bytes(memslot); - nr_dirty_pages = memslot->nr_dirty_pages; - /* If nothing is dirty, don't bother messing with page tables. */ - if (nr_dirty_pages) { - struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap, *dirty_bitmap_head; + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap = memslot->dirty_bitmap; - dirty_bitmap_head = memslot->dirty_bitmap_head; - if (dirty_bitmap == dirty_bitmap_head) - dirty_bitmap_head += n / sizeof(long); - memset(dirty_bitmap_head, 0, n); + spin_lock(&kvm->mmu_lock); - r = -ENOMEM; - slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); - if (!slots) - goto out; + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - memslot = id_to_memslot(slots, log->slot); - memslot->nr_dirty_pages = 0; - memslot->dirty_bitmap = dirty_bitmap_head; - update_memslots(slots, NULL); + if (!dirty_bitmap[i]) + continue; - old_slots = kvm->memslots; - rcu_assign_pointer(kvm->memslots, slots); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_slots); + is_dirty = true; - write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = -EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); } + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + spin_unlock(&kvm->mmu_lock); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; r = 0; out: @@ -3728,9 +3718,8 @@ struct read_write_emulator_ops { static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, *(u64 *)val); vcpu->mmio_read_completed = 0; return 1; } @@ -3766,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - memcpy(vcpu->mmio_data, val, bytes); - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; + + memcpy(vcpu->run->mmio.data, frag->data, frag->len); return X86EMUL_CONTINUE; } @@ -3794,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, gpa_t gpa; int handled, ret; bool write = ops->write; - - if (ops->read_write_prepare && - ops->read_write_prepare(vcpu, val, bytes)) - return X86EMUL_CONTINUE; + struct kvm_mmio_fragment *frag; ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -3823,15 +3810,19 @@ mmio: bytes -= handled; val += handled; - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; - vcpu->mmio_index = 0; + while (bytes) { + unsigned now = min(bytes, 8U); - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = now; + + gpa += now; + val += now; + bytes -= now; + } + return X86EMUL_CONTINUE; } int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -3840,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + gpa_t gpa; + int rc; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; + + vcpu->mmio_nr_fragments = 0; /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; + int now; now = -addr & ~PAGE_MASK; rc = emulator_read_write_onepage(addr, val, now, exception, @@ -3856,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, bytes -= now; } - return emulator_read_write_onepage(addr, val, bytes, exception, - vcpu, ops); + rc = emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (!vcpu->mmio_nr_fragments) + return rc; + + gpa = vcpu->mmio_fragments[0].gpa; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = gpa; + + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, @@ -5263,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); } - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); @@ -5282,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) { + kvm_x86_ops->cancel_injection(vcpu); + goto out; + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -5456,33 +5474,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +/* + * Implements the following, as a state machine: + * + * read: + * for each fragment + * write gpa, len + * exit + * copy data + * execute insn + * + * write: + * for each fragment + * write gpa, len + * copy data + * exit + */ static int complete_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; + struct kvm_mmio_fragment *frag; int r; if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) return 1; if (vcpu->mmio_needed) { - vcpu->mmio_needed = 0; + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; if (!vcpu->mmio_is_write) - memcpy(vcpu->mmio_data + vcpu->mmio_index, - run->mmio.data, 8); - vcpu->mmio_index += 8; - if (vcpu->mmio_index < vcpu->mmio_size) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; - memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); - run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); - run->mmio.is_write = vcpu->mmio_is_write; - vcpu->mmio_needed = 1; - return 0; + memcpy(frag->data, run->mmio.data, frag->len); + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; + goto done; } + /* Initiate next fragment */ + ++frag; + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - return 1; - vcpu->mmio_read_completed = 1; + memcpy(run->mmio.data, frag->data, frag->len); + run->mmio.len = frag->len; + run->mmio.is_write = vcpu->mmio_is_write; + return 0; + } +done: vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -6399,21 +6439,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) kvm_cpu_has_interrupt(vcpu)); } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - - me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) - smp_send_reschedule(cpu); - put_cpu(); + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cb80c29..3d1134d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) static inline int is_paging(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); + return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } static inline u32 bit(int bitno) diff --git a/arch/xtensa/include/asm/kvm_para.h b/arch/xtensa/include/asm/kvm_para.h new file mode 100644 index 0000000..14fab8f --- /dev/null +++ b/arch/xtensa/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include <asm-generic/kvm_para.h> diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 3650636..766cb7b 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -17,6 +17,7 @@ #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memory.h> +#include <linux/module.h> #include <linux/platform_device.h> #include <asm/chpid.h> #include <asm/sclp.h> @@ -38,7 +39,8 @@ struct read_info_sccb { u64 facilities; /* 48-55 */ u8 _reserved2[84 - 56]; /* 56-83 */ u8 fac84; /* 84 */ - u8 _reserved3[91 - 85]; /* 85-90 */ + u8 fac85; /* 85 */ + u8 _reserved3[91 - 86]; /* 86-90 */ u8 flags; /* 91 */ u8 _reserved4[100 - 92]; /* 92-99 */ u32 rnsize2; /* 100-103 */ @@ -51,6 +53,7 @@ static int __initdata early_read_info_sccb_valid; u64 sclp_facilities; static u8 sclp_fac84; +static u8 sclp_fac85; static unsigned long long rzm; static unsigned long long rnmax; @@ -112,6 +115,7 @@ void __init sclp_facilities_detect(void) sccb = &early_read_info_sccb; sclp_facilities = sccb->facilities; sclp_fac84 = sccb->fac84; + sclp_fac85 = sccb->fac85; rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; rzm <<= 20; @@ -127,6 +131,12 @@ unsigned long long sclp_get_rzm(void) return rzm; } +u8 sclp_get_fac85(void) +{ + return sclp_fac85; +} +EXPORT_SYMBOL_GPL(sclp_get_fac85); + /* * This function will be called after sclp_facilities_detect(), which gets * called from early.c code. Therefore the sccb should have valid contents. diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h new file mode 100644 index 0000000..5cba37f --- /dev/null +++ b/include/asm-generic/kvm_para.h @@ -0,0 +1,22 @@ +#ifndef _ASM_GENERIC_KVM_PARA_H +#define _ASM_GENERIC_KVM_PARA_H + +#ifdef __KERNEL__ + +/* + * This function is used by architectures that support kvm to avoid issuing + * false soft lockup messages. + */ +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + +static inline unsigned int kvm_arch_para_features(void) +{ + return 0; +} + +#endif /* _KERNEL__ */ + +#endif diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6c322a9..09f2b3a 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -449,6 +449,30 @@ struct kvm_ppc_pvinfo { __u8 pad[108]; }; +/* for KVM_PPC_GET_SMMU_INFO */ +#define KVM_PPC_PAGE_SIZES_MAX_SZ 8 + +struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ +}; + +struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + +#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 +#define KVM_PPC_1T_SEGMENTS 0x00000002 + +struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u32 pad; + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; +}; + #define KVMIO 0xAE /* machine type bits, to be used as argument to KVM_CREATE_VM */ @@ -589,6 +613,10 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_S390_UCONTROL 73 #define KVM_CAP_SYNC_REGS 74 #define KVM_CAP_PCI_2_3 75 +#define KVM_CAP_KVMCLOCK_CTRL 76 +#define KVM_CAP_SIGNAL_MSI 77 +#define KVM_CAP_PPC_GET_SMMU_INFO 78 +#define KVM_CAP_S390_COW 79 #ifdef KVM_CAP_IRQ_ROUTING @@ -714,6 +742,14 @@ struct kvm_one_reg { __u64 addr; }; +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u8 pad[16]; +}; + /* * ioctls for VM fds */ @@ -788,6 +824,10 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PCI_2_3 */ #define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ struct kvm_assigned_pci_dev) +/* Available with KVM_CAP_SIGNAL_MSI */ +#define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) +/* Available with KVM_CAP_PPC_GET_SMMU_INFO */ +#define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) /* * ioctls for vcpu fds @@ -859,6 +899,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_ONE_REG */ #define KVM_GET_ONE_REG _IOW(KVMIO, 0xab, struct kvm_one_reg) #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) +/* VM is being stopped by host */ +#define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 72cbf08..c446435 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,6 +35,20 @@ #endif /* + * If we support unaligned MMIO, at most one fragment will be split into two: + */ +#ifdef KVM_UNALIGNED_MMIO +# define KVM_EXTRA_MMIO_FRAGMENTS 1 +#else +# define KVM_EXTRA_MMIO_FRAGMENTS 0 +#endif + +#define KVM_USER_MMIO_SIZE 8 + +#define KVM_MAX_MMIO_FRAGMENTS \ + (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) + +/* * vcpu->requests bit members */ #define KVM_REQ_TLB_FLUSH 0 @@ -68,10 +82,11 @@ struct kvm_io_range { struct kvm_io_device *dev; }; +#define NR_IOBUS_DEVS 1000 + struct kvm_io_bus { int dev_count; -#define NR_IOBUS_DEVS 300 - struct kvm_io_range range[NR_IOBUS_DEVS]; + struct kvm_io_range range[]; }; enum kvm_bus { @@ -113,7 +128,18 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, - EXITING_GUEST_MODE + EXITING_GUEST_MODE, + READING_SHADOW_PAGE_TABLES, +}; + +/* + * Sometimes a large or cross-page mmio needs to be broken up into separate + * exits for userspace servicing. + */ +struct kvm_mmio_fragment { + gpa_t gpa; + void *data; + unsigned len; }; struct kvm_vcpu { @@ -143,10 +169,9 @@ struct kvm_vcpu { int mmio_needed; int mmio_read_completed; int mmio_is_write; - int mmio_size; - int mmio_index; - unsigned char mmio_data[KVM_MMIO_SIZE]; - gpa_t mmio_phys_addr; + int mmio_cur_fragment; + int mmio_nr_fragments; + struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS]; #endif #ifdef CONFIG_KVM_ASYNC_PF @@ -178,8 +203,6 @@ struct kvm_memory_slot { unsigned long flags; unsigned long *rmap; unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_head; - unsigned long nr_dirty_pages; struct kvm_arch_memory_slot arch; unsigned long userspace_addr; int user_alloc; @@ -438,6 +461,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +bool kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); @@ -506,6 +531,7 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); @@ -521,6 +547,15 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif +static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +{ +#ifdef __KVM_HAVE_ARCH_WQP + return vcpu->arch.wqp; +#else + return &vcpu->wq; +#endif +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); @@ -769,6 +804,8 @@ int kvm_set_irq_routing(struct kvm *kvm, unsigned flags); void kvm_free_irq_routing(struct kvm *kvm); +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); + #else static inline void kvm_free_irq_routing(struct kvm *kvm) {} diff --git a/kernel/watchdog.c b/kernel/watchdog.c index df30ee0..e5e1d85 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include <linux/sysctl.h> #include <asm/irq_regs.h> +#include <linux/kvm_para.h> #include <linux/perf_event.h> int watchdog_enabled = 1; @@ -280,6 +281,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) __this_cpu_write(softlockup_touch_sync, false); sched_clock_tick(); } + + /* Clear the guest paused flag on watchdog reset */ + kvm_check_and_clear_guest_paused(); __touch_watchdog(); return HRTIMER_RESTART; } @@ -292,6 +296,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ duration = is_softlockup(touch_ts); if (unlikely(duration)) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a soft lockup, check to see if the host + * stopped the vm before we issue the warning + */ + if (kvm_check_and_clear_guest_paused()) + return HRTIMER_RESTART; + /* only warn once */ if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index f63ccb0..28694f4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -18,3 +18,6 @@ config KVM_MMIO config KVM_ASYNC_PF bool + +config HAVE_KVM_MSI + bool diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index dcaf272c26..26fd54d 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -254,13 +254,17 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, } } +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + return test_bit(vector, ioapic->handled_vectors); +} + void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - if (!test_bit(vector, ioapic->handled_vectors)) - return; spin_lock(&ioapic->lock); __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); spin_unlock(&ioapic->lock); diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 0b190c3..32872a0 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -71,6 +71,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 9f614b4..a6a0365 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -138,6 +138,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct kvm_kernel_irq_routing_entry route; + + if (!irqchip_in_kernel(kvm) || msi->flags != 0) + return -EINVAL; + + route.msi.address_lo = msi->address_lo; + route.msi.address_hi = msi->address_hi; + route.msi.data = msi->data; + + return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); +} + /* * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9739b53..7e14068 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -522,12 +522,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) return; if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE) - vfree(memslot->dirty_bitmap_head); + vfree(memslot->dirty_bitmap); else - kfree(memslot->dirty_bitmap_head); + kfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; - memslot->dirty_bitmap_head = NULL; } /* @@ -611,8 +610,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) /* * Allocation size is twice as large as the actual dirty bitmap size. - * This makes it possible to do double buffering: see x86's - * kvm_vm_ioctl_get_dirty_log(). + * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. */ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { @@ -627,8 +625,6 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return -ENOMEM; - memslot->dirty_bitmap_head = memslot->dirty_bitmap; - memslot->nr_dirty_pages = 0; #endif /* !CONFIG_S390 */ return 0; } @@ -1477,8 +1473,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; - if (!test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap)) - memslot->nr_dirty_pages++; + /* TODO: introduce set_bit_le() and use it */ + test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap); } } @@ -1515,6 +1511,30 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); } +#ifndef CONFIG_S390 +/* + * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. + */ +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + wait_queue_head_t *wqp; + + wqp = kvm_arch_vcpu_wq(vcpu); + if (waitqueue_active(wqp)) { + wake_up_interruptible(wqp); + ++vcpu->stat.halt_wakeup; + } + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (kvm_arch_vcpu_should_kick(vcpu)) + smp_send_reschedule(cpu); + put_cpu(); +} +#endif /* !CONFIG_S390 */ + void kvm_resched(struct kvm_vcpu *vcpu) { if (!need_resched()) @@ -1523,6 +1543,31 @@ void kvm_resched(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_resched); +bool kvm_vcpu_yield_to(struct kvm_vcpu *target) +{ + struct pid *pid; + struct task_struct *task = NULL; + + rcu_read_lock(); + pid = rcu_dereference(target->pid); + if (pid) + task = get_pid_task(target->pid, PIDTYPE_PID); + rcu_read_unlock(); + if (!task) + return false; + if (task->flags & PF_VCPU) { + put_task_struct(task); + return false; + } + if (yield_to(task, 1)) { + put_task_struct(task); + return true; + } + put_task_struct(task); + return false; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); + void kvm_vcpu_on_spin(struct kvm_vcpu *me) { struct kvm *kvm = me->kvm; @@ -1541,8 +1586,6 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) */ for (pass = 0; pass < 2 && !yielded; pass++) { kvm_for_each_vcpu(i, vcpu, kvm) { - struct task_struct *task = NULL; - struct pid *pid; if (!pass && i < last_boosted_vcpu) { i = last_boosted_vcpu; continue; @@ -1552,24 +1595,11 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (waitqueue_active(&vcpu->wq)) continue; - rcu_read_lock(); - pid = rcu_dereference(vcpu->pid); - if (pid) - task = get_pid_task(vcpu->pid, PIDTYPE_PID); - rcu_read_unlock(); - if (!task) - continue; - if (task->flags & PF_VCPU) { - put_task_struct(task); - continue; - } - if (yield_to(task, 1)) { - put_task_struct(task); + if (kvm_vcpu_yield_to(vcpu)) { kvm->last_boosted_vcpu = i; yielded = 1; break; } - put_task_struct(task); } } } @@ -2040,6 +2070,17 @@ static long kvm_vm_ioctl(struct file *filp, mutex_unlock(&kvm->lock); break; #endif +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_SIGNAL_MSI: { + struct kvm_msi msi; + + r = -EFAULT; + if (copy_from_user(&msi, argp, sizeof msi)) + goto out; + r = kvm_send_userspace_msi(kvm, &msi); + break; + } +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -2168,6 +2209,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_SET_BOOT_CPU_ID: #endif case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_CAP_SIGNAL_MSI: +#endif return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: @@ -2394,9 +2438,6 @@ int kvm_io_bus_sort_cmp(const void *p1, const void *p2) int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, gpa_t addr, int len) { - if (bus->dev_count == NR_IOBUS_DEVS) - return -ENOSPC; - bus->range[bus->dev_count++] = (struct kvm_io_range) { .addr = addr, .len = len, @@ -2496,12 +2537,15 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - if (bus->dev_count > NR_IOBUS_DEVS-1) + if (bus->dev_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kmemdup(bus, sizeof(struct kvm_io_bus), GFP_KERNEL); + new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * + sizeof(struct kvm_io_range)), GFP_KERNEL); if (!new_bus) return -ENOMEM; + memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * + sizeof(struct kvm_io_range))); kvm_io_bus_insert_dev(new_bus, dev, addr, len); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); @@ -2518,27 +2562,25 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - - new_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL); - if (!new_bus) - return -ENOMEM; - r = -ENOENT; - for (i = 0; i < new_bus->dev_count; i++) - if (new_bus->range[i].dev == dev) { + for (i = 0; i < bus->dev_count; i++) + if (bus->range[i].dev == dev) { r = 0; - new_bus->dev_count--; - new_bus->range[i] = new_bus->range[new_bus->dev_count]; - sort(new_bus->range, new_bus->dev_count, - sizeof(struct kvm_io_range), - kvm_io_bus_sort_cmp, NULL); break; } - if (r) { - kfree(new_bus); + if (r) return r; - } + + new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * + sizeof(struct kvm_io_range)), GFP_KERNEL); + if (!new_bus) + return -ENOMEM; + + memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); + new_bus->dev_count--; + memcpy(new_bus->range + i, bus->range + i + 1, + (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); |