summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt87
-rw-r--r--Documentation/virtual/kvm/devices/s390_flic.txt2
-rw-r--r--Documentation/virtual/kvm/devices/vm.txt52
-rw-r--r--Documentation/virtual/kvm/mmu.txt6
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h2
-rw-r--r--arch/powerpc/include/asm/kvm_host.h5
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h51
-rw-r--r--arch/powerpc/include/asm/pgtable.h3
-rw-r--r--arch/powerpc/include/asm/smp.h4
-rw-r--r--arch/powerpc/include/asm/xics.h1
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h9
-rw-r--r--arch/powerpc/kernel/smp.c28
-rw-r--r--arch/powerpc/kvm/Makefile2
-rw-r--r--arch/powerpc/kvm/book3s.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c156
-rw-r--r--arch/powerpc/kvm/book3s_64_vio_hv.c330
-rw-r--r--arch/powerpc/kvm/book3s_hv.c192
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c3
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c131
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S4
-rw-r--r--arch/powerpc/kvm/book3s_pr_papr.c35
-rw-r--r--arch/powerpc/kvm/powerpc.c38
-rw-r--r--arch/powerpc/mm/pgtable.c8
-rw-r--r--arch/powerpc/perf/hv-24x7.c8
-rw-r--r--arch/powerpc/sysdev/xics/icp-native.c21
-rw-r--r--arch/s390/include/asm/kvm_host.h8
-rw-r--r--arch/s390/include/uapi/asm/kvm.h8
-rw-r--r--arch/s390/kvm/gaccess.c57
-rw-r--r--arch/s390/kvm/gaccess.h38
-rw-r--r--arch/s390/kvm/intercept.c78
-rw-r--r--arch/s390/kvm/interrupt.c40
-rw-r--r--arch/s390/kvm/kvm-s390.c55
-rw-r--r--arch/s390/kvm/kvm-s390.h17
-rw-r--r--arch/s390/kvm/priv.c13
-rw-r--r--arch/x86/include/asm/kvm_host.h30
-rw-r--r--arch/x86/include/asm/kvm_page_track.h61
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h4
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/assigned-dev.c14
-rw-r--r--arch/x86/kvm/hyperv.c50
-rw-r--r--arch/x86/kvm/i8254.c350
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/ioapic.c30
-rw-r--r--arch/x86/kvm/ioapic.h17
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h8
-rw-r--r--arch/x86/kvm/irq_comm.c27
-rw-r--r--arch/x86/kvm/lapic.c156
-rw-r--r--arch/x86/kvm/lapic.h17
-rw-r--r--arch/x86/kvm/mmu.c502
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/page_track.c222
-rw-r--r--arch/x86/kvm/paging_tmpl.h35
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm.c3
-rw-r--r--arch/x86/kvm/trace.h12
-rw-r--r--arch/x86/kvm/vmx.c72
-rw-r--r--arch/x86/kvm/x86.c156
-rw-r--r--arch/x86/kvm/x86.h16
-rw-r--r--drivers/hv/hyperv_vmbus.h6
-rw-r--r--include/trace/events/kvm.h9
-rw-r--r--include/uapi/linux/kvm.h21
-rw-r--r--virt/kvm/async_pf.c8
-rw-r--r--virt/kvm/kvm_main.c37
64 files changed, 2509 insertions, 884 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index cb2ef0b..4d0542c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3039,6 +3039,87 @@ Returns: 0 on success, -1 on error
Queues an SMI on the thread's vcpu.
+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
+4.98 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit
+windows, described in 4.62 KVM_CREATE_SPAPR_TCE
+
+This capability uses extended struct in ioctl interface:
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+ __u64 liobn;
+ __u32 page_shift;
+ __u32 flags;
+ __u64 offset; /* in pages */
+ __u64 size; /* in pages */
+};
+
+The aim of extension is to support an additional bigger DMA window with
+a variable page size.
+KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and
+a bus offset of the corresponding DMA window, @size and @offset are numbers
+of IOMMU pages.
+
+@flags are not used at the moment.
+
+The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
+
+4.98 KVM_REINJECT_CONTROL
+
+Capability: KVM_CAP_REINJECT_CONTROL
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_reinject_control (in)
+Returns: 0 on success,
+ -EFAULT if struct kvm_reinject_control cannot be read,
+ -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier.
+
+i8254 (PIT) has two modes, reinject and !reinject. The default is reinject,
+where KVM queues elapsed i8254 ticks and monitors completion of interrupt from
+vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its
+interrupt whenever there isn't a pending interrupt from i8254.
+!reinject mode injects an interrupt as soon as a tick arrives.
+
+struct kvm_reinject_control {
+ __u8 pit_reinject;
+ __u8 reserved[31];
+};
+
+pit_reinject = 0 (!reinject mode) is recommended, unless running an old
+operating system that uses the PIT for timing (e.g. Linux 2.4.x).
+
5. The kvm_run structure
------------------------
@@ -3343,6 +3424,7 @@ EOI was received.
struct kvm_hyperv_exit {
#define KVM_EXIT_HYPERV_SYNIC 1
+#define KVM_EXIT_HYPERV_HCALL 2
__u32 type;
union {
struct {
@@ -3351,6 +3433,11 @@ EOI was received.
__u64 evt_page;
__u64 msg_page;
} synic;
+ struct {
+ __u64 input;
+ __u64 result;
+ __u64 params[2];
+ } hcall;
} u;
};
/* KVM_EXIT_HYPERV */
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index d1ad9d5..e3e314c 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -88,6 +88,8 @@ struct kvm_s390_io_adapter_req {
perform a gmap translation for the guest address provided in addr,
pin a userspace page for the translated address and add it to the
list of mappings
+ Note: A new mapping will be created unconditionally; therefore,
+ the calling code should avoid making duplicate mappings.
KVM_S390_IO_ADAPTER_UNMAP
release a userspace page for the translated address specified in addr
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index f083a16..a9ea877 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -84,3 +84,55 @@ Returns: -EBUSY in case 1 or more vcpus are already activated (only in write
-EFAULT if the given address is not accessible from kernel space
-ENOMEM if not enough memory is available to process the ioctl
0 in case of success
+
+3. GROUP: KVM_S390_VM_TOD
+Architectures: s390
+
+3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH
+
+Allows user space to set/get the TOD clock extension (u8).
+
+Parameters: address of a buffer in user space to store the data (u8) to
+Returns: -EFAULT if the given address is not accessible from kernel space
+ -EINVAL if setting the TOD clock extension to != 0 is not supported
+
+3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW
+
+Allows user space to set/get bits 0-63 of the TOD clock register as defined in
+the POP (u64).
+
+Parameters: address of a buffer in user space to store the data (u64) to
+Returns: -EFAULT if the given address is not accessible from kernel space
+
+4. GROUP: KVM_S390_VM_CRYPTO
+Architectures: s390
+
+4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o)
+
+Allows user space to enable aes key wrapping, including generating a new
+wrapping key.
+
+Parameters: none
+Returns: 0
+
+4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o)
+
+Allows user space to enable dea key wrapping, including generating a new
+wrapping key.
+
+Parameters: none
+Returns: 0
+
+4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o)
+
+Allows user space to disable aes key wrapping, clearing the wrapping key.
+
+Parameters: none
+Returns: 0
+
+4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o)
+
+Allows user space to disable dea key wrapping, clearing the wrapping key.
+
+Parameters: none
+Returns: 0
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index daf9c0f..dda2e93 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -391,11 +391,11 @@ To instantiate a large spte, four constraints must be satisfied:
write-protected pages
- the guest page must be wholly contained by a single memory slot
-To check the last two conditions, the mmu maintains a ->write_count set of
+To check the last two conditions, the mmu maintains a ->disallow_lpage set of
arrays for each memory slot and large page size. Every write protected page
-causes its write_count to be incremented, thus preventing instantiation of
+causes its disallow_lpage to be incremented, thus preventing instantiation of
a large spte. The frames at the end of an unaligned memory slot have
-artificially inflated ->write_counts so they can never be instantiated.
+artificially inflated ->disallow_lpages so they can never be instantiated.
Zapping all pages (page generation count)
=========================================
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 2aa79c8..7529aab 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
}
#endif
-#define SPAPR_TCE_SHIFT 12
-
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
#endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9d08d8c..2e7c791 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -182,7 +182,10 @@ struct kvmppc_spapr_tce_table {
struct list_head list;
struct kvm *kvm;
u64 liobn;
- u32 window_size;
+ struct rcu_head rcu;
+ u32 page_shift;
+ u64 offset; /* in pages */
+ u64 size; /* window size in pages */
struct page *pages[0];
};
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2241d53..2544eda 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -165,9 +165,25 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
- struct kvm_create_spapr_tce *args);
+ struct kvm_create_spapr_tce_64 *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+ struct kvm_vcpu *vcpu, unsigned long liobn);
+extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+ unsigned long ioba, unsigned long npages);
+extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
+ unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+ unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+ unsigned long idx, unsigned long tce);
extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_value, unsigned long npages);
extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba);
extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
@@ -437,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{
return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
}
+extern void kvmppc_alloc_host_rm_ops(void);
+extern void kvmppc_free_host_rm_ops(void);
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -445,7 +463,11 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
+extern void kvmppc_xics_ipi_action(void);
+extern int h_ipi_redirect;
#else
+static inline void kvmppc_alloc_host_rm_ops(void) {};
+static inline void kvmppc_free_host_rm_ops(void) {};
static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
@@ -459,6 +481,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
{ return 0; }
#endif
+/*
+ * Host-side operations we want to set up while running in real
+ * mode in the guest operating on the xics.
+ * Currently only VCPU wakeup is supported.
+ */
+
+union kvmppc_rm_state {
+ unsigned long raw;
+ struct {
+ u32 in_host;
+ u32 rm_action;
+ };
+};
+
+struct kvmppc_host_rm_core {
+ union kvmppc_rm_state rm_state;
+ void *rm_data;
+ char pad[112];
+};
+
+struct kvmppc_host_rm_ops {
+ struct kvmppc_host_rm_core *rm_core;
+ void (*vcpu_kick)(struct kvm_vcpu *vcpu);
+};
+
+extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+
static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_BOOKE_HV
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index ac9fb11..47897a3 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
}
return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
}
+
+unsigned long vmalloc_to_phys(void *vmalloc_addr);
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 825663c..78083ed 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu);
#define PPC_MSG_TICK_BROADCAST 2
#define PPC_MSG_DEBUGGER_BREAK 3
+/* This is only used by the powernv kernel */
+#define PPC_MSG_RM_HOST_ACTION 4
+
/* for irq controllers that have dedicated ipis per message (4) */
extern int smp_request_message_ipi(int virq, int message);
extern const char *smp_ipi_name[];
@@ -121,6 +124,7 @@ extern const char *smp_ipi_name[];
/* for irq controllers with only a single ipi */
extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
extern void smp_muxed_ipi_message_pass(int cpu, int msg);
+extern void smp_muxed_ipi_set_message(int cpu, int msg);
extern irqreturn_t smp_ipi_demux(void);
void smp_init_pSeries(void);
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index 0e25bdb..2546048 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -30,6 +30,7 @@
#ifdef CONFIG_PPC_ICP_NATIVE
extern int icp_native_init(void);
extern void icp_native_flush_interrupt(void);
+extern void icp_native_cause_ipi_rm(int cpu);
#else
static inline int icp_native_init(void) { return -ENODEV; }
#endif
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index ab4d473..c93cf35 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
__u32 window_size;
};
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+ __u64 liobn;
+ __u32 page_shift;
+ __u32 flags;
+ __u64 offset; /* in pages */
+ __u64 size; /* in pages */
+};
+
/* for KVM_ALLOCATE_RMA */
struct kvm_allocate_rma {
__u64 rma_size;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ec9ec20..cb8be5d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg)
#ifdef CONFIG_PPC_SMP_MUXED_IPI
struct cpu_messages {
- int messages; /* current messages */
+ long messages; /* current messages */
unsigned long data; /* data for cause ipi */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data)
info->data = data;
}
-void smp_muxed_ipi_message_pass(int cpu, int msg)
+void smp_muxed_ipi_set_message(int cpu, int msg)
{
struct cpu_messages *info = &per_cpu(ipi_message, cpu);
char *message = (char *)&info->messages;
@@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
*/
smp_mb();
message[msg] = 1;
+}
+
+void smp_muxed_ipi_message_pass(int cpu, int msg)
+{
+ struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+
+ smp_muxed_ipi_set_message(cpu, msg);
/*
* cause_ipi functions are required to include a full barrier
* before doing whatever causes the IPI.
@@ -236,20 +243,31 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
}
#ifdef __BIG_ENDIAN__
-#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
#else
-#define IPI_MESSAGE(A) (1 << (8 * (A)))
+#define IPI_MESSAGE(A) (1uL << (8 * (A)))
#endif
irqreturn_t smp_ipi_demux(void)
{
struct cpu_messages *info = this_cpu_ptr(&ipi_message);
- unsigned int all;
+ unsigned long all;
mb(); /* order any irq clear */
do {
all = xchg(&info->messages, 0);
+#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+ /*
+ * Must check for PPC_MSG_RM_HOST_ACTION messages
+ * before PPC_MSG_CALL_FUNCTION messages because when
+ * a VM is destroyed, we call kick_all_cpus_sync()
+ * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+ * messages have completed before we free any VCPUs.
+ */
+ if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+ kvmppc_xics_ipi_action();
+#endif
if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
generic_smp_call_function_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 0570eef..7f7b6d8 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
KVM := ../../../virt/kvm
common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o
+ $(KVM)/eventfd.o $(KVM)/vfio.o
CFLAGS_e500_mmu.o := -I.
CFLAGS_e500_mmu_host.o := -I.
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 638c6d9..b34220d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
{
#ifdef CONFIG_PPC64
- INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+ INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
#endif
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 54cf9bc..2c2d103 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -14,6 +14,7 @@
*
* Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
* Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
*/
#include <linux/types.h>
@@ -36,28 +37,69 @@
#include <asm/ppc-opcode.h>
#include <asm/kvm_host.h>
#include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
-#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
+static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
+{
+ return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
-static long kvmppc_stt_npages(unsigned long window_size)
+static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
{
- return ALIGN((window_size >> SPAPR_TCE_SHIFT)
- * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+ unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+ (tce_pages * sizeof(struct page *));
+
+ return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
}
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
{
- struct kvm *kvm = stt->kvm;
- int i;
+ long ret = 0;
- mutex_lock(&kvm->lock);
- list_del(&stt->list);
- for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+ if (!current || !current->mm)
+ return ret; /* process exited */
+
+ down_write(&current->mm->mmap_sem);
+
+ if (inc) {
+ unsigned long locked, lock_limit;
+
+ locked = current->mm->locked_vm + stt_pages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ ret = -ENOMEM;
+ else
+ current->mm->locked_vm += stt_pages;
+ } else {
+ if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
+ stt_pages = current->mm->locked_vm;
+
+ current->mm->locked_vm -= stt_pages;
+ }
+
+ pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+ inc ? '+' : '-',
+ stt_pages << PAGE_SHIFT,
+ current->mm->locked_vm << PAGE_SHIFT,
+ rlimit(RLIMIT_MEMLOCK),
+ ret ? " - exceeded" : "");
+
+ up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+static void release_spapr_tce_table(struct rcu_head *head)
+{
+ struct kvmppc_spapr_tce_table *stt = container_of(head,
+ struct kvmppc_spapr_tce_table, rcu);
+ unsigned long i, npages = kvmppc_tce_pages(stt->size);
+
+ for (i = 0; i < npages; i++)
__free_page(stt->pages[i]);
- kfree(stt);
- mutex_unlock(&kvm->lock);
- kvm_put_kvm(kvm);
+ kfree(stt);
}
static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -65,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
struct page *page;
- if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+ if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
return VM_FAULT_SIGBUS;
page = stt->pages[vmf->pgoff];
@@ -88,7 +130,14 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
{
struct kvmppc_spapr_tce_table *stt = filp->private_data;
- release_spapr_tce_table(stt);
+ list_del_rcu(&stt->list);
+
+ kvm_put_kvm(stt->kvm);
+
+ kvmppc_account_memlimit(
+ kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+ call_rcu(&stt->rcu, release_spapr_tce_table);
+
return 0;
}
@@ -98,20 +147,29 @@ static const struct file_operations kvm_spapr_tce_fops = {
};
long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
- struct kvm_create_spapr_tce *args)
+ struct kvm_create_spapr_tce_64 *args)
{
struct kvmppc_spapr_tce_table *stt = NULL;
- long npages;
+ unsigned long npages, size;
int ret = -ENOMEM;
int i;
+ if (!args->size)
+ return -EINVAL;
+
/* Check this LIOBN hasn't been previously allocated */
list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
if (stt->liobn == args->liobn)
return -EBUSY;
}
- npages = kvmppc_stt_npages(args->window_size);
+ size = args->size;
+ npages = kvmppc_tce_pages(size);
+ ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+ if (ret) {
+ stt = NULL;
+ goto fail;
+ }
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
GFP_KERNEL);
@@ -119,7 +177,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
goto fail;
stt->liobn = args->liobn;
- stt->window_size = args->window_size;
+ stt->page_shift = args->page_shift;
+ stt->offset = args->offset;
+ stt->size = size;
stt->kvm = kvm;
for (i = 0; i < npages; i++) {
@@ -131,7 +191,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
kvm_get_kvm(kvm);
mutex_lock(&kvm->lock);
- list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+ list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
mutex_unlock(&kvm->lock);
@@ -148,3 +208,59 @@ fail:
}
return ret;
}
+
+long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_list, unsigned long npages)
+{
+ struct kvmppc_spapr_tce_table *stt;
+ long i, ret = H_SUCCESS, idx;
+ unsigned long entry, ua = 0;
+ u64 __user *tces, tce;
+
+ stt = kvmppc_find_table(vcpu, liobn);
+ if (!stt)
+ return H_TOO_HARD;
+
+ entry = ioba >> stt->page_shift;
+ /*
+ * SPAPR spec says that the maximum size of the list is 512 TCEs
+ * so the whole table fits in 4K page
+ */
+ if (npages > 512)
+ return H_PARAMETER;
+
+ if (tce_list & (SZ_4K - 1))
+ return H_PARAMETER;
+
+ ret = kvmppc_ioba_validate(stt, ioba, npages);
+ if (ret != H_SUCCESS)
+ return ret;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+ ret = H_TOO_HARD;
+ goto unlock_exit;
+ }
+ tces = (u64 __user *) ua;
+
+ for (i = 0; i < npages; ++i) {
+ if (get_user(tce, tces + i)) {
+ ret = H_TOO_HARD;
+ goto unlock_exit;
+ }
+ tce = be64_to_cpu(tce);
+
+ ret = kvmppc_tce_validate(stt, tce);
+ if (ret != H_SUCCESS)
+ goto unlock_exit;
+
+ kvmppc_tce_put(stt, entry + i, tce);
+ }
+
+unlock_exit:
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 89e96b3..44be73e 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -14,6 +14,7 @@
*
* Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
* Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
*/
#include <linux/types.h>
@@ -30,76 +31,321 @@
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
+#include <asm/mmu_context.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/kvm_host.h>
#include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/iommu.h>
#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
-/* WARNING: This will be called in real-mode on HV KVM and virtual
+/*
+ * Finds a TCE table descriptor by LIOBN.
+ *
+ * WARNING: This will be called in real or virtual mode on HV KVM and virtual
* mode on PR KVM
*/
-long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
- unsigned long ioba, unsigned long tce)
+struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+ unsigned long liobn)
{
struct kvm *kvm = vcpu->kvm;
struct kvmppc_spapr_tce_table *stt;
+ list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+ if (stt->liobn == liobn)
+ return stt;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_find_table);
+
+/*
+ * Validates IO address.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ * mode on PR KVM
+ */
+long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+ unsigned long ioba, unsigned long npages)
+{
+ unsigned long mask = (1ULL << stt->page_shift) - 1;
+ unsigned long idx = ioba >> stt->page_shift;
+
+ if ((ioba & mask) || (idx < stt->offset) ||
+ (idx - stt->offset + npages > stt->size) ||
+ (idx + npages < idx))
+ return H_PARAMETER;
+
+ return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+
+/*
+ * Validates TCE address.
+ * At the moment flags and page mask are validated.
+ * As the host kernel does not access those addresses (just puts them
+ * to the table and user space is supposed to process them), we can skip
+ * checking other things (such as TCE is a guest RAM address or the page
+ * was actually allocated).
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ * mode on PR KVM
+ */
+long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+{
+ unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
+ unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+
+ if (tce & mask)
+ return H_PARAMETER;
+
+ return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+
+/* Note on the use of page_address() in real mode,
+ *
+ * It is safe to use page_address() in real mode on ppc64 because
+ * page_address() is always defined as lowmem_page_address()
+ * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic
+ * operation and does not access page struct.
+ *
+ * Theoretically page_address() could be defined different
+ * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+ * would have to be enabled.
+ * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+ * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+ * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+ * is not expected to be enabled on ppc32, page_address()
+ * is safe for ppc32 as well.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ * mode on PR KVM
+ */
+static u64 *kvmppc_page_address(struct page *page)
+{
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+ return (u64 *) page_address(page);
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Called in both real and virtual modes.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ * mode on PR KVM
+ */
+void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+ unsigned long idx, unsigned long tce)
+{
+ struct page *page;
+ u64 *tbl;
+
+ idx -= stt->offset;
+ page = stt->pages[idx / TCES_PER_PAGE];
+ tbl = kvmppc_page_address(page);
+
+ tbl[idx % TCES_PER_PAGE] = tce;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_put);
+
+long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+ unsigned long *ua, unsigned long **prmap)
+{
+ unsigned long gfn = gpa >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
+
+ memslot = search_memslots(kvm_memslots(kvm), gfn);
+ if (!memslot)
+ return -EINVAL;
+
+ *ua = __gfn_to_hva_memslot(memslot, gfn) |
+ (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ if (prmap)
+ *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+ unsigned long ioba, unsigned long tce)
+{
+ struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+ long ret;
+
/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
/* liobn, ioba, tce); */
- list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
- if (stt->liobn == liobn) {
- unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
- struct page *page;
- u64 *tbl;
-
- /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */
- /* liobn, stt, stt->window_size); */
- if (ioba >= stt->window_size)
- return H_PARAMETER;
-
- page = stt->pages[idx / TCES_PER_PAGE];
- tbl = (u64 *)page_address(page);
-
- /* FIXME: Need to validate the TCE itself */
- /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
- tbl[idx % TCES_PER_PAGE] = tce;
- return H_SUCCESS;
- }
- }
+ if (!stt)
+ return H_TOO_HARD;
+
+ ret = kvmppc_ioba_validate(stt, ioba, 1);
+ if (ret != H_SUCCESS)
+ return ret;
- /* Didn't find the liobn, punt it to userspace */
- return H_TOO_HARD;
+ ret = kvmppc_tce_validate(stt, tce);
+ if (ret != H_SUCCESS)
+ return ret;
+
+ kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+
+ return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
-long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
- unsigned long ioba)
+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
+ unsigned long ua, unsigned long *phpa)
+{
+ pte_t *ptep, pte;
+ unsigned shift = 0;
+
+ ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+ if (!ptep || !pte_present(*ptep))
+ return -ENXIO;
+ pte = *ptep;
+
+ if (!shift)
+ shift = PAGE_SHIFT;
+
+ /* Avoid handling anything potentially complicated in realmode */
+ if (shift > PAGE_SHIFT)
+ return -EAGAIN;
+
+ if (!pte_young(pte))
+ return -EAGAIN;
+
+ *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) |
+ (ua & ~PAGE_MASK);
+
+ return 0;
+}
+
+long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_list, unsigned long npages)
{
- struct kvm *kvm = vcpu->kvm;
struct kvmppc_spapr_tce_table *stt;
+ long i, ret = H_SUCCESS;
+ unsigned long tces, entry, ua = 0;
+ unsigned long *rmap = NULL;
+
+ stt = kvmppc_find_table(vcpu, liobn);
+ if (!stt)
+ return H_TOO_HARD;
+
+ entry = ioba >> stt->page_shift;
+ /*
+ * The spec says that the maximum size of the list is 512 TCEs
+ * so the whole table addressed resides in 4K page
+ */
+ if (npages > 512)
+ return H_PARAMETER;
+
+ if (tce_list & (SZ_4K - 1))
+ return H_PARAMETER;
+
+ ret = kvmppc_ioba_validate(stt, ioba, npages);
+ if (ret != H_SUCCESS)
+ return ret;
- list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
- if (stt->liobn == liobn) {
- unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
- struct page *page;
- u64 *tbl;
+ if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+ return H_TOO_HARD;
- if (ioba >= stt->window_size)
- return H_PARAMETER;
+ rmap = (void *) vmalloc_to_phys(rmap);
- page = stt->pages[idx / TCES_PER_PAGE];
- tbl = (u64 *)page_address(page);
+ /*
+ * Synchronize with the MMU notifier callbacks in
+ * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+ * While we have the rmap lock, code running on other CPUs
+ * cannot finish unmapping the host real page that backs
+ * this guest real page, so we are OK to access the host
+ * real page.
+ */
+ lock_rmap(rmap);
+ if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+ ret = H_TOO_HARD;
+ goto unlock_exit;
+ }
+
+ for (i = 0; i < npages; ++i) {
+ unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
+
+ ret = kvmppc_tce_validate(stt, tce);
+ if (ret != H_SUCCESS)
+ goto unlock_exit;
- vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
- return H_SUCCESS;
- }
+ kvmppc_tce_put(stt, entry + i, tce);
}
- /* Didn't find the liobn, punt it to userspace */
- return H_TOO_HARD;
+unlock_exit:
+ unlock_rmap(rmap);
+
+ return ret;
+}
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+ unsigned long liobn, unsigned long ioba,
+ unsigned long tce_value, unsigned long npages)
+{
+ struct kvmppc_spapr_tce_table *stt;
+ long i, ret;
+
+ stt = kvmppc_find_table(vcpu, liobn);
+ if (!stt)
+ return H_TOO_HARD;
+
+ ret = kvmppc_ioba_validate(stt, ioba, npages);
+ if (ret != H_SUCCESS)
+ return ret;
+
+ /* Check permission bits only to allow userspace poison TCE for debug */
+ if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+ return H_PARAMETER;
+
+ for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+ kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+
+ return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+ unsigned long ioba)
+{
+ struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+ long ret;
+ unsigned long idx;
+ struct page *page;
+ u64 *tbl;
+
+ if (!stt)
+ return H_TOO_HARD;
+
+ ret = kvmppc_ioba_validate(stt, ioba, 1);
+ if (ret != H_SUCCESS)
+ return ret;
+
+ idx = (ioba >> stt->page_shift) - stt->offset;
+ page = stt->pages[idx / TCES_PER_PAGE];
+ tbl = (u64 *)page_address(page);
+
+ vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+
+ return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
+
+#endif /* KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index baeddb0..f47fffe 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -81,6 +81,17 @@ static int target_smt_mode;
module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
+#ifdef CONFIG_KVM_XICS
+static struct kernel_param_ops module_param_ops = {
+ .set = param_set_int,
+ .get = param_get_int,
+};
+
+module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+#endif
+
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -768,7 +779,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
if (kvmppc_xics_enabled(vcpu)) {
ret = kvmppc_xics_hcall(vcpu, req);
break;
- } /* fallthrough */
+ }
+ return RESUME_HOST;
+ case H_PUT_TCE:
+ ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_PUT_TCE_INDIRECT:
+ ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6),
+ kvmppc_get_gpr(vcpu, 7));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_STUFF_TCE:
+ ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6),
+ kvmppc_get_gpr(vcpu, 7));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
default:
return RESUME_HOST;
}
@@ -2279,6 +2314,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
}
/*
+ * Clear core from the list of active host cores as we are about to
+ * enter the guest. Only do this if it is the primary thread of the
+ * core (not if a subcore) that is entering the guest.
+ */
+static inline void kvmppc_clear_host_core(int cpu)
+{
+ int core;
+
+ if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+ return;
+ /*
+ * Memory barrier can be omitted here as we will do a smp_wmb()
+ * later in kvmppc_start_thread and we need ensure that state is
+ * visible to other CPUs only after we enter guest.
+ */
+ core = cpu >> threads_shift;
+ kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+}
+
+/*
+ * Advertise this core as an active host core since we exited the guest
+ * Only need to do this if it is the primary thread of the core that is
+ * exiting.
+ */
+static inline void kvmppc_set_host_core(int cpu)
+{
+ int core;
+
+ if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+ return;
+
+ /*
+ * Memory barrier can be omitted here because we do a spin_unlock
+ * immediately after this which provides the memory barrier.
+ */
+ core = cpu >> threads_shift;
+ kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+}
+
+/*
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
*/
@@ -2390,6 +2465,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
}
}
+ kvmppc_clear_host_core(pcpu);
+
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
@@ -2486,6 +2563,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
kvmppc_ipi_thread(pcpu + i);
}
+ kvmppc_set_host_core(pcpu);
+
spin_unlock(&vc->lock);
/* make sure updates to secondary vcpu structs are visible now */
@@ -2984,6 +3063,114 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
goto out_srcu;
}
+#ifdef CONFIG_KVM_XICS
+static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
+{
+ unsigned long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ kvmppc_set_host_core(cpu);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ kvmppc_clear_host_core(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvmppc_cpu_notifier = {
+ .notifier_call = kvmppc_cpu_notify,
+};
+
+/*
+ * Allocate a per-core structure for managing state about which cores are
+ * running in the host versus the guest and for exchanging data between
+ * real mode KVM and CPU running in the host.
+ * This is only done for the first VM.
+ * The allocated structure stays even if all VMs have stopped.
+ * It is only freed when the kvm-hv module is unloaded.
+ * It's OK for this routine to fail, we just don't support host
+ * core operations like redirecting H_IPI wakeups.
+ */
+void kvmppc_alloc_host_rm_ops(void)
+{
+ struct kvmppc_host_rm_ops *ops;
+ unsigned long l_ops;
+ int cpu, core;
+ int size;
+
+ /* Not the first time here ? */
+ if (kvmppc_host_rm_ops_hv != NULL)
+ return;
+
+ ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+ if (!ops)
+ return;
+
+ size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+ ops->rm_core = kzalloc(size, GFP_KERNEL);
+
+ if (!ops->rm_core) {
+ kfree(ops);
+ return;
+ }
+
+ get_online_cpus();
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+ if (!cpu_online(cpu))
+ continue;
+
+ core = cpu >> threads_shift;
+ ops->rm_core[core].rm_state.in_host = 1;
+ }
+
+ ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+
+ /*
+ * Make the contents of the kvmppc_host_rm_ops structure visible
+ * to other CPUs before we assign it to the global variable.
+ * Do an atomic assignment (no locks used here), but if someone
+ * beats us to it, just free our copy and return.
+ */
+ smp_wmb();
+ l_ops = (unsigned long) ops;
+
+ if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+ put_online_cpus();
+ kfree(ops->rm_core);
+ kfree(ops);
+ return;
+ }
+
+ register_cpu_notifier(&kvmppc_cpu_notifier);
+
+ put_online_cpus();
+}
+
+void kvmppc_free_host_rm_ops(void)
+{
+ if (kvmppc_host_rm_ops_hv) {
+ unregister_cpu_notifier(&kvmppc_cpu_notifier);
+ kfree(kvmppc_host_rm_ops_hv->rm_core);
+ kfree(kvmppc_host_rm_ops_hv);
+ kvmppc_host_rm_ops_hv = NULL;
+ }
+}
+#endif
+
static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
unsigned long lpcr, lpid;
@@ -2996,6 +3183,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
return -ENOMEM;
kvm->arch.lpid = lpid;
+ kvmppc_alloc_host_rm_ops();
+
/*
* Since we don't flush the TLB when tearing down a VM,
* and this lpid might have previously been used,
@@ -3229,6 +3418,7 @@ static int kvmppc_book3s_init_hv(void)
static void kvmppc_book3s_exit_hv(void)
{
+ kvmppc_free_host_rm_ops();
kvmppc_hv_ops = NULL;
}
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fd7006b..5f0380d 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap)
kvmhv_interrupt_vcore(vc, ee);
}
}
+
+struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 24f5807..980d8a6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -17,12 +17,16 @@
#include <asm/xics.h>
#include <asm/debug.h>
#include <asm/synch.h>
+#include <asm/cputhreads.h>
#include <asm/ppc-opcode.h>
#include "book3s_xics.h"
#define DEBUG_PASSUP
+int h_ipi_redirect = 1;
+EXPORT_SYMBOL(h_ipi_redirect);
+
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
u32 new_irq);
@@ -50,11 +54,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
/* -- ICP routines -- */
+#ifdef CONFIG_SMP
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
+{
+ int hcpu;
+
+ hcpu = hcore << threads_shift;
+ kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
+ smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
+ icp_native_cause_ipi_rm(hcpu);
+}
+#else
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
+#endif
+
+/*
+ * We start the search from our current CPU Id in the core map
+ * and go in a circle until we get back to our ID looking for a
+ * core that is running in host context and that hasn't already
+ * been targeted for another rm_host_ops.
+ *
+ * In the future, could consider using a fairer algorithm (one
+ * that distributes the IPIs better)
+ *
+ * Returns -1, if no CPU could be found in the host
+ * Else, returns a CPU Id which has been reserved for use
+ */
+static inline int grab_next_hostcore(int start,
+ struct kvmppc_host_rm_core *rm_core, int max, int action)
+{
+ bool success;
+ int core;
+ union kvmppc_rm_state old, new;
+
+ for (core = start + 1; core < max; core++) {
+ old = new = READ_ONCE(rm_core[core].rm_state);
+
+ if (!old.in_host || old.rm_action)
+ continue;
+
+ /* Try to grab this host core if not taken already. */
+ new.rm_action = action;
+
+ success = cmpxchg64(&rm_core[core].rm_state.raw,
+ old.raw, new.raw) == old.raw;
+ if (success) {
+ /*
+ * Make sure that the store to the rm_action is made
+ * visible before we return to caller (and the
+ * subsequent store to rm_data) to synchronize with
+ * the IPI handler.
+ */
+ smp_wmb();
+ return core;
+ }
+ }
+
+ return -1;
+}
+
+static inline int find_available_hostcore(int action)
+{
+ int core;
+ int my_core = smp_processor_id() >> threads_shift;
+ struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core;
+
+ core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action);
+ if (core == -1)
+ core = grab_next_hostcore(core, rm_core, my_core, action);
+
+ return core;
+}
+
static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
struct kvm_vcpu *this_vcpu)
{
struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
int cpu;
+ int hcore;
/* Mark the target VCPU as having an interrupt pending */
vcpu->stat.queue_intr++;
@@ -66,11 +143,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
return;
}
- /* Check if the core is loaded, if not, too hard */
+ /*
+ * Check if the core is loaded,
+ * if not, find an available host core to post to wake the VCPU,
+ * if we can't find one, set up state to eventually return too hard.
+ */
cpu = vcpu->arch.thread_cpu;
if (cpu < 0 || cpu >= nr_cpu_ids) {
- this_icp->rm_action |= XICS_RM_KICK_VCPU;
- this_icp->rm_kick_target = vcpu;
+ hcore = -1;
+ if (kvmppc_host_rm_ops_hv && h_ipi_redirect)
+ hcore = find_available_hostcore(XICS_RM_KICK_VCPU);
+ if (hcore != -1) {
+ icp_send_hcore_msg(hcore, vcpu);
+ } else {
+ this_icp->rm_action |= XICS_RM_KICK_VCPU;
+ this_icp->rm_kick_target = vcpu;
+ }
return;
}
@@ -623,3 +711,40 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
bail:
return check_too_hard(xics, icp);
}
+
+/* --- Non-real mode XICS-related built-in routines --- */
+
+/**
+ * Host Operations poked by RM KVM
+ */
+static void rm_host_ipi_action(int action, void *data)
+{
+ switch (action) {
+ case XICS_RM_KICK_VCPU:
+ kvmppc_host_rm_ops_hv->vcpu_kick(data);
+ break;
+ default:
+ WARN(1, "Unexpected rm_action=%d data=%p\n", action, data);
+ break;
+ }
+
+}
+
+void kvmppc_xics_ipi_action(void)
+{
+ int core;
+ unsigned int cpu = smp_processor_id();
+ struct kvmppc_host_rm_core *rm_corep;
+
+ core = cpu >> threads_shift;
+ rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core];
+
+ if (rm_corep->rm_data) {
+ rm_host_ipi_action(rm_corep->rm_state.rm_action,
+ rm_corep->rm_data);
+ /* Order these stores against the real mode KVM */
+ rm_corep->rm_data = NULL;
+ smp_wmb();
+ rm_corep->rm_state.rm_action = 0;
+ }
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6ee26de..ed16182 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -2006,8 +2006,8 @@ hcall_real_table:
.long 0 /* 0x12c */
.long 0 /* 0x130 */
.long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
- .long 0 /* 0x138 */
- .long 0 /* 0x13c */
+ .long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+ .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
.long 0 /* 0x140 */
.long 0 /* 0x144 */
.long 0 /* 0x148 */
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index f2c75a1..02176fd 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
return EMULATE_DONE;
}
+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+ unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+ unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+ unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+ unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+ long rc;
+
+ rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba,
+ tce, npages);
+ if (rc == H_TOO_HARD)
+ return EMULATE_FAIL;
+ kvmppc_set_gpr(vcpu, 3, rc);
+ return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+ unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+ unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+ unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
+ unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+ long rc;
+
+ rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
+ if (rc == H_TOO_HARD)
+ return EMULATE_FAIL;
+ kvmppc_set_gpr(vcpu, 3, rc);
+ return EMULATE_DONE;
+}
+
static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
{
long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
return kvmppc_h_pr_bulk_remove(vcpu);
case H_PUT_TCE:
return kvmppc_h_pr_put_tce(vcpu);
+ case H_PUT_TCE_INDIRECT:
+ return kvmppc_h_pr_put_tce_indirect(vcpu);
+ case H_STUFF_TCE:
+ return kvmppc_h_pr_stuff_tce(vcpu);
case H_CEDE:
kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
kvm_vcpu_block(vcpu);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a3b182d..19aa59b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -33,6 +33,7 @@
#include <asm/tlbflush.h>
#include <asm/cputhreads.h>
#include <asm/irqflags.h>
+#include <asm/iommu.h>
#include "timing.h"
#include "irq.h"
#include "../mm/mmu_decl.h"
@@ -437,6 +438,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
unsigned int i;
struct kvm_vcpu *vcpu;
+#ifdef CONFIG_KVM_XICS
+ /*
+ * We call kick_all_cpus_sync() to ensure that all
+ * CPUs have executed any pending IPIs before we
+ * continue and free VCPUs structures below.
+ */
+ if (is_kvmppc_hv_enabled(kvm))
+ kick_all_cpus_sync();
+#endif
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -509,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
+ case KVM_CAP_SPAPR_TCE_64:
case KVM_CAP_PPC_ALLOC_HTAB:
case KVM_CAP_PPC_RTAS:
case KVM_CAP_PPC_FIXUP_HCALL:
@@ -569,6 +581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PPC_GET_SMMU_INFO:
r = 1;
break;
+ case KVM_CAP_SPAPR_MULTITCE:
+ r = 1;
+ break;
#endif
default:
r = 0;
@@ -1331,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
#ifdef CONFIG_PPC_BOOK3S_64
+ case KVM_CREATE_SPAPR_TCE_64: {
+ struct kvm_create_spapr_tce_64 create_tce_64;
+
+ r = -EFAULT;
+ if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64)))
+ goto out;
+ if (create_tce_64.flags) {
+ r = -EINVAL;
+ goto out;
+ }
+ r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+ goto out;
+ }
case KVM_CREATE_SPAPR_TCE: {
struct kvm_create_spapr_tce create_tce;
+ struct kvm_create_spapr_tce_64 create_tce_64;
r = -EFAULT;
if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
goto out;
- r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+
+ create_tce_64.liobn = create_tce.liobn;
+ create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K;
+ create_tce_64.offset = 0;
+ create_tce_64.size = create_tce.window_size >>
+ IOMMU_PAGE_SHIFT_4K;
+ create_tce_64.flags = 0;
+ r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
goto out;
}
case KVM_PPC_GET_SMMU_INFO: {
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 83dfd79..de37ff4 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
}
#endif /* CONFIG_DEBUG_VM */
+unsigned long vmalloc_to_phys(void *va)
+{
+ unsigned long pfn = vmalloc_to_pfn(va);
+
+ BUG_ON(!pfn);
+ return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
+}
+EXPORT_SYMBOL_GPL(vmalloc_to_phys);
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9f9dfda..3b09ecf 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
}
}
-static unsigned long vmalloc_to_phys(void *v)
-{
- struct page *p = vmalloc_to_page(v);
-
- BUG_ON(!p);
- return page_to_phys(p) + offset_in_page(v);
-}
-
/* */
struct event_uniq {
struct rb_node node;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index eae3265..afdf62f 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
icp_native_set_qirr(cpu, IPI_PRIORITY);
}
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void icp_native_cause_ipi_rm(int cpu)
+{
+ /*
+ * Currently not used to send IPIs to another CPU
+ * on the same core. Only caller is KVM real mode.
+ * Need the physical address of the XICS to be
+ * previously saved in kvm_hstate in the paca.
+ */
+ unsigned long xics_phys;
+
+ /*
+ * Just like the cause_ipi functions, it is required to
+ * include a full barrier (out8 includes a sync) before
+ * causing the IPI.
+ */
+ xics_phys = paca[cpu].kvm_hstate.xics_phys;
+ out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+}
+#endif
+
/*
* Called when an interrupt is received on an off-line CPU to
* clear the interrupt, so that the CPU can go back to nap mode.
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8959ebb..727e7f7 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -229,17 +229,11 @@ struct kvm_s390_itdb {
__u8 data[256];
} __packed;
-struct kvm_s390_vregs {
- __vector128 vrs[32];
- __u8 reserved200[512]; /* for future vector expansion */
-} __packed;
-
struct sie_page {
struct kvm_s390_sie_block sie_block;
__u8 reserved200[1024]; /* 0x0200 */
struct kvm_s390_itdb itdb; /* 0x0600 */
- __u8 reserved700[1280]; /* 0x0700 */
- struct kvm_s390_vregs vregs; /* 0x0c00 */
+ __u8 reserved700[2304]; /* 0x0700 */
} __packed;
struct kvm_vcpu_stat {
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index fe84bd5..347fe5a 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -154,6 +154,7 @@ struct kvm_guest_debug_arch {
#define KVM_SYNC_PFAULT (1UL << 5)
#define KVM_SYNC_VRS (1UL << 6)
#define KVM_SYNC_RICCB (1UL << 7)
+#define KVM_SYNC_FPRS (1UL << 8)
/* definition of registers in kvm_run */
struct kvm_sync_regs {
__u64 prefix; /* prefix register */
@@ -168,9 +169,12 @@ struct kvm_sync_regs {
__u64 pft; /* pfault token [PFAULT] */
__u64 pfs; /* pfault select [PFAULT] */
__u64 pfc; /* pfault compare [PFAULT] */
- __u64 vrs[32][2]; /* vector registers */
+ union {
+ __u64 vrs[32][2]; /* vector registers (KVM_SYNC_VRS) */
+ __u64 fprs[16]; /* fp registers (KVM_SYNC_FPRS) */
+ };
__u8 reserved[512]; /* for future vector expansion */
- __u32 fpc; /* only valid with vector registers */
+ __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */
__u8 padding[52]; /* riccb needs to be 64byte aligned */
__u8 riccb[64]; /* runtime instrumentation controls block */
};
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index d30db40..66938d2 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
}
static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
- int write)
+ enum gacc_mode mode)
{
union alet alet;
struct ale ale;
@@ -454,7 +454,7 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
}
}
- if (ale.fo == 1 && write)
+ if (ale.fo == 1 && mode == GACC_STORE)
return PGM_PROTECTION;
asce->val = aste.asce;
@@ -477,25 +477,28 @@ enum {
};
static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
- ar_t ar, int write)
+ ar_t ar, enum gacc_mode mode)
{
int rc;
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
+ struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
struct trans_exc_code_bits *tec_bits;
memset(pgm, 0, sizeof(*pgm));
tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
- tec_bits->as = psw_bits(*psw).as;
+ tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec_bits->as = psw.as;
- if (!psw_bits(*psw).t) {
+ if (!psw.t) {
asce->val = 0;
asce->r = 1;
return 0;
}
- switch (psw_bits(vcpu->arch.sie_block->gpsw).as) {
+ if (mode == GACC_IFETCH)
+ psw.as = psw.as == PSW_AS_HOME ? PSW_AS_HOME : PSW_AS_PRIMARY;
+
+ switch (psw.as) {
case PSW_AS_PRIMARY:
asce->val = vcpu->arch.sie_block->gcr[1];
return 0;
@@ -506,7 +509,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
asce->val = vcpu->arch.sie_block->gcr[13];
return 0;
case PSW_AS_ACCREG:
- rc = ar_translation(vcpu, asce, ar, write);
+ rc = ar_translation(vcpu, asce, ar, mode);
switch (rc) {
case PGM_ALEN_TRANSLATION:
case PGM_ALE_SEQUENCE:
@@ -538,7 +541,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
* @gva: guest virtual address
* @gpa: points to where guest physical (absolute) address should be stored
* @asce: effective asce
- * @write: indicates if access is a write access
+ * @mode: indicates the access mode to be used
*
* Translate a guest virtual address into a guest absolute address by means
* of dynamic address translation as specified by the architecture.
@@ -554,7 +557,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
*/
static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
unsigned long *gpa, const union asce asce,
- int write)
+ enum gacc_mode mode)
{
union vaddress vaddr = {.addr = gva};
union raddress raddr = {.addr = gva};
@@ -699,7 +702,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
real_address:
raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
absolute_address:
- if (write && dat_protection)
+ if (mode == GACC_STORE && dat_protection)
return PGM_PROTECTION;
if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
return PGM_ADDRESSING;
@@ -728,7 +731,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
unsigned long *pages, unsigned long nr_pages,
- const union asce asce, int write)
+ const union asce asce, enum gacc_mode mode)
{
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -740,13 +743,13 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
while (nr_pages) {
ga = kvm_s390_logical_to_effective(vcpu, ga);
tec_bits->addr = ga >> PAGE_SHIFT;
- if (write && lap_enabled && is_low_address(ga)) {
+ if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
pgm->code = PGM_PROTECTION;
return pgm->code;
}
ga &= PAGE_MASK;
if (psw_bits(*psw).t) {
- rc = guest_translate(vcpu, ga, pages, asce, write);
+ rc = guest_translate(vcpu, ga, pages, asce, mode);
if (rc < 0)
return rc;
if (rc == PGM_PROTECTION)
@@ -768,7 +771,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
}
int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
- unsigned long len, int write)
+ unsigned long len, enum gacc_mode mode)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
unsigned long _len, nr_pages, gpa, idx;
@@ -780,7 +783,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
if (!len)
return 0;
- rc = get_vcpu_asce(vcpu, &asce, ar, write);
+ rc = get_vcpu_asce(vcpu, &asce, ar, mode);
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -792,11 +795,11 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
need_ipte_lock = psw_bits(*psw).t && !asce.r;
if (need_ipte_lock)
ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, write);
+ rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
for (idx = 0; idx < nr_pages && !rc; idx++) {
gpa = *(pages + idx) + (ga & ~PAGE_MASK);
_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
- if (write)
+ if (mode == GACC_STORE)
rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
else
rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
@@ -812,7 +815,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
}
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
- void *data, unsigned long len, int write)
+ void *data, unsigned long len, enum gacc_mode mode)
{
unsigned long _len, gpa;
int rc = 0;
@@ -820,7 +823,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
while (len && !rc) {
gpa = kvm_s390_real_to_abs(vcpu, gra);
_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
- if (write)
+ if (mode)
rc = write_guest_abs(vcpu, gpa, data, _len);
else
rc = read_guest_abs(vcpu, gpa, data, _len);
@@ -841,7 +844,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* has to take care of this.
*/
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
- unsigned long *gpa, int write)
+ unsigned long *gpa, enum gacc_mode mode)
{
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -851,19 +854,19 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
gva = kvm_s390_logical_to_effective(vcpu, gva);
tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- rc = get_vcpu_asce(vcpu, &asce, ar, write);
+ rc = get_vcpu_asce(vcpu, &asce, ar, mode);
tec->addr = gva >> PAGE_SHIFT;
if (rc)
return rc;
if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (write) {
+ if (mode == GACC_STORE) {
rc = pgm->code = PGM_PROTECTION;
return rc;
}
}
if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */
- rc = guest_translate(vcpu, gva, gpa, asce, write);
+ rc = guest_translate(vcpu, gva, gpa, asce, mode);
if (rc > 0) {
if (rc == PGM_PROTECTION)
tec->b61 = 1;
@@ -883,7 +886,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
* check_gva_range - test a range of guest virtual addresses for accessibility
*/
int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
- unsigned long length, int is_write)
+ unsigned long length, enum gacc_mode mode)
{
unsigned long gpa;
unsigned long currlen;
@@ -892,7 +895,7 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
ipte_lock(vcpu);
while (length > 0 && !rc) {
currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
- rc = guest_translate_address(vcpu, gva, ar, &gpa, is_write);
+ rc = guest_translate_address(vcpu, gva, ar, &gpa, mode);
gva += currlen;
length -= currlen;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index ef03726..df0a79d 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -155,16 +155,22 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
return kvm_read_guest(vcpu->kvm, gpa, data, len);
}
+enum gacc_mode {
+ GACC_FETCH,
+ GACC_STORE,
+ GACC_IFETCH,
+};
+
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
- ar_t ar, unsigned long *gpa, int write);
+ ar_t ar, unsigned long *gpa, enum gacc_mode mode);
int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
- unsigned long length, int is_write);
+ unsigned long length, enum gacc_mode mode);
int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
- unsigned long len, int write);
+ unsigned long len, enum gacc_mode mode);
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
- void *data, unsigned long len, int write);
+ void *data, unsigned long len, enum gacc_mode mode);
/**
* write_guest - copy data from kernel space to guest space
@@ -215,7 +221,7 @@ static inline __must_check
int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, ar, data, len, 1);
+ return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
}
/**
@@ -235,7 +241,27 @@ static inline __must_check
int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
unsigned long len)
{
- return access_guest(vcpu, ga, ar, data, len, 0);
+ return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
+}
+
+/**
+ * read_guest_instr - copy instruction data from guest space to kernel space
+ * @vcpu: virtual cpu
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from the current psw address (guest space) to @data (kernel
+ * space).
+ *
+ * The behaviour of read_guest_instr is identical to read_guest, except that
+ * instruction data will be read from primary space when in home-space or
+ * address-space mode.
+ */
+static inline __must_check
+int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len)
+{
+ return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len,
+ GACC_IFETCH);
}
/**
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index d53c107..2e6b54e 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -38,17 +38,32 @@ static const intercept_handler_t instruction_handlers[256] = {
[0xeb] = kvm_s390_handle_eb,
};
-void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilc)
+u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
{
struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
+ u8 ilen = 0;
- /* Use the length of the EXECUTE instruction if necessary */
- if (sie_block->icptstatus & 1) {
- ilc = (sie_block->icptstatus >> 4) & 0x6;
- if (!ilc)
- ilc = 4;
+ switch (vcpu->arch.sie_block->icptcode) {
+ case ICPT_INST:
+ case ICPT_INSTPROGI:
+ case ICPT_OPEREXC:
+ case ICPT_PARTEXEC:
+ case ICPT_IOINST:
+ /* instruction only stored for these icptcodes */
+ ilen = insn_length(vcpu->arch.sie_block->ipa >> 8);
+ /* Use the length of the EXECUTE instruction if necessary */
+ if (sie_block->icptstatus & 1) {
+ ilen = (sie_block->icptstatus >> 4) & 0x6;
+ if (!ilen)
+ ilen = 4;
+ }
+ break;
+ case ICPT_PROGI:
+ /* bit 1+2 of pgmilc are the ilc, so we directly get ilen */
+ ilen = vcpu->arch.sie_block->pgmilc & 0x6;
+ break;
}
- sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilc);
+ return ilen;
}
static int handle_noop(struct kvm_vcpu *vcpu)
@@ -121,11 +136,13 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
-static void __extract_prog_irq(struct kvm_vcpu *vcpu,
- struct kvm_s390_pgm_info *pgm_info)
+static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu)
{
- memset(pgm_info, 0, sizeof(struct kvm_s390_pgm_info));
- pgm_info->code = vcpu->arch.sie_block->iprcc;
+ struct kvm_s390_pgm_info pgm_info = {
+ .code = vcpu->arch.sie_block->iprcc,
+ /* the PSW has already been rewound */
+ .flags = KVM_S390_PGM_FLAGS_NO_REWIND,
+ };
switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) {
case PGM_AFX_TRANSLATION:
@@ -138,7 +155,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
case PGM_PRIMARY_AUTHORITY:
case PGM_SECONDARY_AUTHORITY:
case PGM_SPACE_SWITCH:
- pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
+ pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
break;
case PGM_ALEN_TRANSLATION:
case PGM_ALE_SEQUENCE:
@@ -146,7 +163,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
case PGM_ASTE_SEQUENCE:
case PGM_ASTE_VALIDITY:
case PGM_EXTENDED_AUTHORITY:
- pgm_info->exc_access_id = vcpu->arch.sie_block->eai;
+ pgm_info.exc_access_id = vcpu->arch.sie_block->eai;
break;
case PGM_ASCE_TYPE:
case PGM_PAGE_TRANSLATION:
@@ -154,32 +171,33 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
case PGM_REGION_SECOND_TRANS:
case PGM_REGION_THIRD_TRANS:
case PGM_SEGMENT_TRANSLATION:
- pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
- pgm_info->exc_access_id = vcpu->arch.sie_block->eai;
- pgm_info->op_access_id = vcpu->arch.sie_block->oai;
+ pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
+ pgm_info.exc_access_id = vcpu->arch.sie_block->eai;
+ pgm_info.op_access_id = vcpu->arch.sie_block->oai;
break;
case PGM_MONITOR:
- pgm_info->mon_class_nr = vcpu->arch.sie_block->mcn;
- pgm_info->mon_code = vcpu->arch.sie_block->tecmc;
+ pgm_info.mon_class_nr = vcpu->arch.sie_block->mcn;
+ pgm_info.mon_code = vcpu->arch.sie_block->tecmc;
break;
case PGM_VECTOR_PROCESSING:
case PGM_DATA:
- pgm_info->data_exc_code = vcpu->arch.sie_block->dxc;
+ pgm_info.data_exc_code = vcpu->arch.sie_block->dxc;
break;
case PGM_PROTECTION:
- pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
- pgm_info->exc_access_id = vcpu->arch.sie_block->eai;
+ pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
+ pgm_info.exc_access_id = vcpu->arch.sie_block->eai;
break;
default:
break;
}
if (vcpu->arch.sie_block->iprcc & PGM_PER) {
- pgm_info->per_code = vcpu->arch.sie_block->perc;
- pgm_info->per_atmid = vcpu->arch.sie_block->peratmid;
- pgm_info->per_address = vcpu->arch.sie_block->peraddr;
- pgm_info->per_access_id = vcpu->arch.sie_block->peraid;
+ pgm_info.per_code = vcpu->arch.sie_block->perc;
+ pgm_info.per_atmid = vcpu->arch.sie_block->peratmid;
+ pgm_info.per_address = vcpu->arch.sie_block->peraddr;
+ pgm_info.per_access_id = vcpu->arch.sie_block->peraid;
}
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
}
/*
@@ -208,7 +226,6 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
static int handle_prog(struct kvm_vcpu *vcpu)
{
- struct kvm_s390_pgm_info pgm_info;
psw_t psw;
int rc;
@@ -234,8 +251,7 @@ static int handle_prog(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- __extract_prog_irq(vcpu, &pgm_info);
- return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+ return inject_prog_on_prog_intercept(vcpu);
}
/**
@@ -302,7 +318,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
/* Make sure that the source is paged-in */
rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
- reg2, &srcaddr, 0);
+ reg2, &srcaddr, GACC_FETCH);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
@@ -311,14 +327,14 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
/* Make sure that the destination is paged-in */
rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
- reg1, &dstaddr, 1);
+ reg1, &dstaddr, GACC_STORE);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
if (rc != 0)
return rc;
- kvm_s390_rewind_psw(vcpu, 4);
+ kvm_s390_retry_instr(vcpu);
return 0;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f88ca72..87e2d1a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -335,23 +335,6 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu)
set_intercept_indicators_stop(vcpu);
}
-static u16 get_ilc(struct kvm_vcpu *vcpu)
-{
- switch (vcpu->arch.sie_block->icptcode) {
- case ICPT_INST:
- case ICPT_INSTPROGI:
- case ICPT_OPEREXC:
- case ICPT_PARTEXEC:
- case ICPT_IOINST:
- /* last instruction only stored for these icptcodes */
- return insn_length(vcpu->arch.sie_block->ipa >> 8);
- case ICPT_PROGI:
- return vcpu->arch.sie_block->pgmilc;
- default:
- return 0;
- }
-}
-
static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -588,7 +571,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_pgm_info pgm_info;
int rc = 0, nullifying = false;
- u16 ilc = get_ilc(vcpu);
+ u16 ilen;
spin_lock(&li->lock);
pgm_info = li->irq.pgm;
@@ -596,8 +579,9 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
memset(&li->irq.pgm, 0, sizeof(pgm_info));
spin_unlock(&li->lock);
- VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
- pgm_info.code, ilc);
+ ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+ VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+ pgm_info.code, ilen);
vcpu->stat.deliver_program_int++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
pgm_info.code, 0);
@@ -681,10 +665,11 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
(u8 *) __LC_PER_ACCESS_ID);
}
- if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
- kvm_s390_rewind_psw(vcpu, ilc);
+ if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+ kvm_s390_rewind_psw(vcpu, ilen);
- rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+ /* bit 1+2 of the target are the ilc, so we can directly use ilen */
+ rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
(u64 *) __LC_LAST_BREAK);
rc |= put_guest_lc(vcpu, pgm_info.code,
@@ -1059,8 +1044,16 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
irq->u.pgm.code, 0);
+ if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+ /* auto detection if no valid ILC was given */
+ irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+ irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+ irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+ }
+
if (irq->u.pgm.code == PGM_PER) {
li->irq.pgm.code |= PGM_PER;
+ li->irq.pgm.flags = irq->u.pgm.flags;
/* only modify PER related information */
li->irq.pgm.per_address = irq->u.pgm.per_address;
li->irq.pgm.per_code = irq->u.pgm.per_code;
@@ -1069,6 +1062,7 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
} else if (!(irq->u.pgm.code & PGM_PER)) {
li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
irq->u.pgm.code;
+ li->irq.pgm.flags = irq->u.pgm.flags;
/* only modify non-PER information */
li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
li->irq.pgm.mon_code = irq->u.pgm.mon_code;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4af21c7..28bd5ea 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -274,7 +274,6 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
unsigned long address;
struct gmap *gmap = kvm->arch.gmap;
- down_read(&gmap->mm->mmap_sem);
/* Loop over all guest pages */
last_gfn = memslot->base_gfn + memslot->npages;
for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
@@ -282,8 +281,10 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
if (gmap_test_and_clear_dirty(address, gmap))
mark_page_dirty(kvm, cur_gfn);
+ if (fatal_signal_pending(current))
+ return;
+ cond_resched();
}
- up_read(&gmap->mm->mmap_sem);
}
/* Section: vm related */
@@ -1414,8 +1415,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
KVM_SYNC_PFAULT;
if (test_kvm_facility(vcpu->kvm, 64))
vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
- if (test_kvm_facility(vcpu->kvm, 129))
+ /* fprs can be synchronized via vrs, even if the guest has no vx. With
+ * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
+ */
+ if (MACHINE_HAS_VX)
vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
+ else
+ vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
if (kvm_is_ucontrol(vcpu->kvm))
return __kvm_ucontrol_vcpu_init(vcpu);
@@ -1430,10 +1436,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
- /* Depending on MACHINE_HAS_VX, data stored to vrs either
- * has vector register or floating point register format.
- */
- current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+ if (MACHINE_HAS_VX)
+ current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+ else
+ current->thread.fpu.regs = vcpu->run->s.regs.fprs;
current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
if (test_fp_ctl(current->thread.fpu.fpc))
/* User space provided an invalid FPC, let's clear it */
@@ -2158,8 +2164,10 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
{
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- u8 opcode;
+ struct kvm_s390_pgm_info pgm_info = {
+ .code = PGM_ADDRESSING,
+ };
+ u8 opcode, ilen;
int rc;
VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
@@ -2173,12 +2181,21 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
* to look up the current opcode to get the length of the instruction
* to be able to forward the PSW.
*/
- rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
- if (rc)
- return kvm_s390_inject_prog_cond(vcpu, rc);
- psw->addr = __rewind_psw(*psw, -insn_length(opcode));
-
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ rc = read_guest_instr(vcpu, &opcode, 1);
+ ilen = insn_length(opcode);
+ if (rc < 0) {
+ return rc;
+ } else if (rc) {
+ /* Instruction-Fetching Exceptions - we can't detect the ilen.
+ * Forward by arbitrary ilc, injection will take care of
+ * nullification if necessary.
+ */
+ pgm_info = vcpu->arch.pgm;
+ ilen = 4;
+ }
+ pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID;
+ kvm_s390_forward_psw(vcpu, ilen);
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
}
static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
@@ -2386,7 +2403,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
fprs, 128);
} else {
rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
- vcpu->run->s.regs.vrs, 128);
+ vcpu->run->s.regs.fprs, 128);
}
rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
vcpu->run->s.regs.gprs, 128);
@@ -2605,7 +2622,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
switch (mop->op) {
case KVM_S390_MEMOP_LOGICAL_READ:
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+ mop->size, GACC_FETCH);
break;
}
r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
@@ -2616,7 +2634,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
break;
case KVM_S390_MEMOP_LOGICAL_WRITE:
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+ mop->size, GACC_STORE);
break;
}
if (copy_from_user(tmpbuf, uaddr, mop->size)) {
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index df1abad..1c756c7 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -19,6 +19,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/facility.h>
+#include <asm/processor.h>
typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
@@ -212,8 +213,22 @@ int kvm_s390_reinject_io_int(struct kvm *kvm,
int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
/* implemented in intercept.c */
-void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilc);
+u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu);
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
+static inline void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilen)
+{
+ struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
+
+ sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilen);
+}
+static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
+{
+ kvm_s390_rewind_psw(vcpu, -ilen);
+}
+static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
+{
+ kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
+}
/* implemented in priv.c */
int is_valid_psw(psw_t *psw);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index ed74e86..add9909 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -173,7 +173,7 @@ static int handle_skey(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
- kvm_s390_rewind_psw(vcpu, 4);
+ kvm_s390_retry_instr(vcpu);
VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
return 0;
}
@@ -184,7 +184,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
if (psw_bits(vcpu->arch.sie_block->gpsw).p)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
- kvm_s390_rewind_psw(vcpu, 4);
+ kvm_s390_retry_instr(vcpu);
VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
return 0;
}
@@ -759,8 +759,8 @@ static int handle_essa(struct kvm_vcpu *vcpu)
if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- /* Rewind PSW to repeat the ESSA instruction */
- kvm_s390_rewind_psw(vcpu, 4);
+ /* Retry the ESSA instruction */
+ kvm_s390_retry_instr(vcpu);
vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
down_read(&gmap->mm->mmap_sem);
@@ -981,11 +981,12 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
ipte_lock(vcpu);
- ret = guest_translate_address(vcpu, address1, ar, &gpa, 1);
+ ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE);
if (ret == PGM_PROTECTION) {
/* Write protected? Try again with read-only... */
cc = 1;
- ret = guest_translate_address(vcpu, address1, ar, &gpa, 0);
+ ret = guest_translate_address(vcpu, address1, ar, &gpa,
+ GACC_FETCH);
}
if (ret) {
if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44adbb8..d110dc4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -32,6 +32,7 @@
#include <asm/mtrr.h>
#include <asm/msr-index.h>
#include <asm/asm.h>
+#include <asm/kvm_page_track.h>
#define KVM_MAX_VCPUS 255
#define KVM_SOFT_MAX_VCPUS 160
@@ -214,6 +215,14 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
};
+/*
+ * the pages used as guest page table on soft mmu are tracked by
+ * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
+ * by indirect shadow page can not be more than 15 bits.
+ *
+ * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ */
union kvm_mmu_page_role {
unsigned word;
struct {
@@ -276,7 +285,7 @@ struct kvm_mmu_page {
#endif
/* Number of writes since the last time traversal visited this page. */
- int write_flooding_count;
+ atomic_t write_flooding_count;
};
struct kvm_pio_request {
@@ -338,12 +347,8 @@ struct kvm_mmu {
struct rsvd_bits_validate guest_rsvd_check;
- /*
- * Bitmap: bit set = last pte in walk
- * index[0:1]: level (zero-based)
- * index[2]: pte.ps
- */
- u8 last_pte_bitmap;
+ /* Can have large pages at levels 2..last_nonleaf_level-1. */
+ u8 last_nonleaf_level;
bool nx;
@@ -644,12 +649,13 @@ struct kvm_vcpu_arch {
};
struct kvm_lpage_info {
- int write_count;
+ int disallow_lpage;
};
struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
};
/*
@@ -694,6 +700,8 @@ struct kvm_arch {
*/
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
+ struct kvm_page_track_notifier_node mmu_sp_tracker;
+ struct kvm_page_track_notifier_head track_notifier_head;
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
@@ -754,6 +762,8 @@ struct kvm_arch {
bool irqchip_split;
u8 nr_reserved_ioapic_pins;
+
+ bool disabled_lapic_found;
};
struct kvm_vm_stat {
@@ -988,6 +998,8 @@ void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -1127,8 +1139,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
new file mode 100644
index 0000000..c2b8d24
--- /dev/null
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_KVM_PAGE_TRACK_H
+#define _ASM_X86_KVM_PAGE_TRACK_H
+
+enum kvm_page_track_mode {
+ KVM_PAGE_TRACK_WRITE,
+ KVM_PAGE_TRACK_MAX,
+};
+
+/*
+ * The notifier represented by @kvm_page_track_notifier_node is linked into
+ * the head which will be notified when guest is triggering the track event.
+ *
+ * Write access on the head is protected by kvm->mmu_lock, read access
+ * is protected by track_srcu.
+ */
+struct kvm_page_track_notifier_head {
+ struct srcu_struct track_srcu;
+ struct hlist_head track_notifier_list;
+};
+
+struct kvm_page_track_notifier_node {
+ struct hlist_node node;
+
+ /*
+ * It is called when guest is writing the write-tracked page
+ * and write emulation is finished at that time.
+ *
+ * @vcpu: the vcpu where the write access happened.
+ * @gpa: the physical address written by guest.
+ * @new: the data was written to the address.
+ * @bytes: the written length.
+ */
+ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+};
+
+void kvm_page_track_init(struct kvm *kvm);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont);
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+#endif
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 7956412..9b1a918 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -226,7 +226,9 @@
(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
/* Declare the various hypercall operations. */
-#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_POST_MESSAGE 0x005c
+#define HVCALL_SIGNAL_EVENT 0x005d
#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a1ff508..464fa47 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o
+ hyperv.o page_track.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
+
kvm-intel-y += vmx.o pmu_intel.o
kvm-amd-y += svm.o pmu_amd.o
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index 9dc091a..308b859 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel {
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
int assigned_dev_id)
{
- struct list_head *ptr;
struct kvm_assigned_dev_kernel *match;
- list_for_each(ptr, head) {
- match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+ list_for_each_entry(match, head, list) {
if (match->assigned_dev_id == assigned_dev_id)
return match;
}
@@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm,
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
- struct list_head *ptr, *ptr2;
- struct kvm_assigned_dev_kernel *assigned_dev;
-
- list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
- assigned_dev = list_entry(ptr,
- struct kvm_assigned_dev_kernel,
- list);
+ struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
+ list_for_each_entry_safe(assigned_dev, tmp,
+ &kvm->arch.assigned_dev_head, list) {
kvm_free_assigned_device(kvm, assigned_dev);
}
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c58ba67..5ff3485 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
}
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
+ longmode = is_64_bit_mode(vcpu);
+ if (longmode)
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+ else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+ }
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+ return 1;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
@@ -1055,7 +1076,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
*/
if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
+ return 1;
}
longmode = is_64_bit_mode(vcpu);
@@ -1083,22 +1104,33 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+ /* Hypercall continuation is not supported yet */
+ if (rep_cnt || rep_idx) {
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ goto set_result;
+ }
+
switch (code) {
- case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
kvm_vcpu_on_spin(vcpu);
break;
+ case HVCALL_POST_MESSAGE:
+ case HVCALL_SIGNAL_EVENT:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = param;
+ vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_hypercall_complete_userspace;
+ return 0;
default:
res = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
}
+set_result:
ret = res | (((u64)rep_done & 0xfff) << 32);
- if (longmode) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
- } else {
- kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
- }
-
+ kvm_hv_hypercall_set_result(vcpu, ret);
return 1;
}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index b0ea42b..a4bf5b4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -51,32 +51,9 @@
#define RW_STATE_WORD0 3
#define RW_STATE_WORD1 4
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
{
- union {
- u64 ll;
- struct {
- u32 low, high;
- } l;
- } u, res;
- u64 rl, rh;
-
- u.ll = a;
- rl = (u64)u.l.low * (u64)b;
- rh = (u64)u.l.high * (u64)b;
- rh += (rl >> 32);
- res.l.high = div64_u64(rh, c);
- res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
- return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
switch (c->mode) {
default:
@@ -97,18 +74,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
c->gate = val;
}
-static int pit_get_gate(struct kvm *kvm, int channel)
+static int pit_get_gate(struct kvm_pit *pit, int channel)
{
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- return kvm->arch.vpit->pit_state.channels[channel].gate;
+ return pit->pit_state.channels[channel].gate;
}
-static s64 __kpit_elapsed(struct kvm *kvm)
+static s64 __kpit_elapsed(struct kvm_pit *pit)
{
s64 elapsed;
ktime_t remaining;
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ struct kvm_kpit_state *ps = &pit->pit_state;
if (!ps->period)
return 0;
@@ -128,26 +103,23 @@ static s64 __kpit_elapsed(struct kvm *kvm)
return elapsed;
}
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
int channel)
{
if (channel == 0)
- return __kpit_elapsed(kvm);
+ return __kpit_elapsed(pit);
return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
}
-static int pit_get_count(struct kvm *kvm, int channel)
+static int pit_get_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int counter;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
case 0:
@@ -167,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel)
return counter;
}
-static int pit_get_out(struct kvm *kvm, int channel)
+static int pit_get_out(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int out;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
default:
@@ -202,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel)
return out;
}
-static void pit_latch_count(struct kvm *kvm, int channel)
+static void pit_latch_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->count_latched) {
- c->latched_count = pit_get_count(kvm, channel);
+ c->latched_count = pit_get_count(pit, channel);
c->count_latched = c->rw_mode;
}
}
-static void pit_latch_status(struct kvm *kvm, int channel)
+static void pit_latch_status(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->status_latched) {
/* TODO: Return NULL COUNT (bit 6). */
- c->status = ((pit_get_out(kvm, channel) << 7) |
+ c->status = ((pit_get_out(pit, channel) << 7) |
(c->rw_mode << 4) |
(c->mode << 1) |
c->bcd);
@@ -232,26 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+ return container_of(ps, struct kvm_pit, pit_state);
+}
+
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- int value;
-
- spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pending);
- if (value < 0)
- /* spurious acks can be generated if, for example, the
- * PIC is being reset. Handle it gracefully here
- */
- atomic_inc(&ps->pending);
- else if (value > 0)
- /* in this case, we had multiple outstanding pit interrupts
- * that we needed to inject. Reinject
- */
- queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
- ps->irq_ack = 1;
- spin_unlock(&ps->inject_lock);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ queue_kthread_work(&pit->worker, &pit->expired);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -282,45 +243,36 @@ static void pit_do_work(struct kthread_work *work)
struct kvm_vcpu *vcpu;
int i;
struct kvm_kpit_state *ps = &pit->pit_state;
- int inject = 0;
- /* Try to inject pending interrupts when
- * last one has been acked.
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+ * the MP specification. We propagate a PIT interrupt to all
+ * VCPUs and only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
*/
- spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- spin_unlock(&ps->inject_lock);
- if (inject) {
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
- }
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
- struct kvm_pit *pt = ps->kvm->arch.vpit;
+ struct kvm_pit *pt = pit_state_to_pit(ps);
- if (ps->reinject || !atomic_read(&ps->pending)) {
+ if (atomic_read(&ps->reinject))
atomic_inc(&ps->pending);
- queue_kthread_work(&pt->worker, &pt->expired);
- }
+
+ queue_kthread_work(&pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -329,30 +281,54 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ if (reinject) {
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
s64 interval;
if (!ioapic_in_kernel(kvm) ||
ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
return;
- interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&ps->timer);
- flush_kthread_work(&ps->pit->expired);
+ flush_kthread_work(&pit->expired);
ps->period = interval;
ps->is_periodic = is_period;
- ps->timer.function = pit_timer_fn;
- ps->kvm = ps->pit->kvm;
-
- atomic_set(&ps->pending, 0);
- ps->irq_ack = 1;
+ kvm_pit_reset_reinject(pit);
/*
* Do not allow the guest to program periodic timers with small
@@ -375,11 +351,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
HRTIMER_MODE_ABS);
}
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
- WARN_ON(!mutex_is_locked(&ps->lock));
+ struct kvm_kpit_state *ps = &pit->pit_state;
pr_debug("load_count val is %d, channel is %d\n", val, channel);
@@ -404,29 +378,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- create_pit_timer(kvm, val, 0);
+ create_pit_timer(pit, val, 0);
break;
case 2:
case 3:
- create_pit_timer(kvm, val, 1);
+ create_pit_timer(pit, val, 1);
break;
default:
- destroy_pit_timer(kvm->arch.vpit);
+ destroy_pit_timer(pit);
}
}
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
WARN_ON(channel != 0);
- saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
- kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
- pit_load_count(kvm, channel, val);
- kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
} else {
- pit_load_count(kvm, channel, val);
+ pit_load_count(pit, channel, val);
}
}
@@ -452,7 +430,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int channel, access;
struct kvm_kpit_channel_state *s;
u32 val = *(u32 *) data;
@@ -476,9 +453,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
s = &pit_state->channels[channel];
if (val & (2 << channel)) {
if (!(val & 0x20))
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
if (!(val & 0x10))
- pit_latch_status(kvm, channel);
+ pit_latch_status(pit, channel);
}
}
} else {
@@ -486,7 +463,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
s = &pit_state->channels[channel];
access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
if (access == 0) {
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
} else {
s->rw_mode = access;
s->read_state = access;
@@ -503,17 +480,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
switch (s->write_state) {
default:
case RW_STATE_LSB:
- pit_load_count(kvm, addr, val);
+ pit_load_count(pit, addr, val);
break;
case RW_STATE_MSB:
- pit_load_count(kvm, addr, val << 8);
+ pit_load_count(pit, addr, val << 8);
break;
case RW_STATE_WORD0:
s->write_latch = val;
s->write_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
s->write_state = RW_STATE_WORD0;
break;
}
@@ -529,7 +506,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int ret, count;
struct kvm_kpit_channel_state *s;
if (!pit_in_range(addr))
@@ -566,20 +542,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
switch (s->read_state) {
default:
case RW_STATE_LSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
break;
case RW_STATE_MSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
break;
case RW_STATE_WORD0:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
s->read_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
s->read_state = RW_STATE_WORD0;
break;
@@ -600,14 +576,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
if (addr != KVM_SPEAKER_BASE_ADDRESS)
return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
pit_state->speaker_data_on = (val >> 1) & 1;
- pit_set_gate(kvm, 2, val & 1);
+ pit_set_gate(pit, 2, val & 1);
mutex_unlock(&pit_state->lock);
return 0;
}
@@ -618,7 +593,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
int ret;
if (addr != KVM_SPEAKER_BASE_ADDRESS)
@@ -628,8 +602,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
mutex_lock(&pit_state->lock);
- ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
- (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
+ (pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
if (len > sizeof(ret))
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
@@ -637,33 +611,28 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
return 0;
}
-void kvm_pit_reset(struct kvm_pit *pit)
+static void kvm_pit_reset(struct kvm_pit *pit)
{
int i;
struct kvm_kpit_channel_state *c;
- mutex_lock(&pit->pit_state.lock);
pit->pit_state.flags = 0;
for (i = 0; i < 3; i++) {
c = &pit->pit_state.channels[i];
c->mode = 0xff;
c->gate = (i != 2);
- pit_load_count(pit->kvm, i, 0);
+ pit_load_count(pit, i, 0);
}
- mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pending, 0);
- pit->pit_state.irq_ack = 1;
+ kvm_pit_reset_reinject(pit);
}
static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
{
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
- if (!mask) {
- atomic_set(&pit->pit_state.pending, 0);
- pit->pit_state.irq_ack = 1;
- }
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
}
static const struct kvm_io_device_ops pit_dev_ops = {
@@ -690,14 +659,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
return NULL;
pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0) {
- kfree(pit);
- return NULL;
- }
+ if (pit->irq_source_id < 0)
+ goto fail_request;
mutex_init(&pit->pit_state.lock);
- mutex_lock(&pit->pit_state.lock);
- spin_lock_init(&pit->pit_state.inject_lock);
pid = get_pid(task_tgid(current));
pid_nr = pid_vnr(pid);
@@ -706,36 +671,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
init_kthread_worker(&pit->worker);
pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
"kvm-pit/%d", pid_nr);
- if (IS_ERR(pit->worker_task)) {
- mutex_unlock(&pit->pit_state.lock);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
- kfree(pit);
- return NULL;
- }
+ if (IS_ERR(pit->worker_task))
+ goto fail_kthread;
+
init_kthread_work(&pit->expired, pit_do_work);
- kvm->arch.vpit = pit;
pit->kvm = kvm;
pit_state = &pit->pit_state;
- pit_state->pit = pit;
hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->timer.function = pit_timer_fn;
+
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
- kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->reinject = true;
- mutex_unlock(&pit->pit_state.lock);
+ pit->mask_notifier.func = pit_mask_notifer;
kvm_pit_reset(pit);
- pit->mask_notifier.func = pit_mask_notifer;
- kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_pit_set_reinject(pit, true);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
KVM_PIT_MEM_LENGTH, &pit->dev);
if (ret < 0)
- goto fail;
+ goto fail_register_pit;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
@@ -743,42 +702,35 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
KVM_SPEAKER_BASE_ADDRESS, 4,
&pit->speaker_dev);
if (ret < 0)
- goto fail_unregister;
+ goto fail_register_speaker;
}
return pit;
-fail_unregister:
+fail_register_speaker:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
-
-fail:
- kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_register_pit:
+ kvm_pit_set_reinject(pit, false);
kthread_stop(pit->worker_task);
+fail_kthread:
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
kfree(pit);
return NULL;
}
void kvm_free_pit(struct kvm *kvm)
{
- struct hrtimer *timer;
-
- if (kvm->arch.vpit) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &kvm->arch.vpit->speaker_dev);
- kvm_unregister_irq_mask_notifier(kvm, 0,
- &kvm->arch.vpit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm,
- &kvm->arch.vpit->pit_state.irq_ack_notifier);
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.timer;
- hrtimer_cancel(timer);
- flush_kthread_work(&kvm->arch.vpit->expired);
- kthread_stop(kvm->arch.vpit->worker_task);
- kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- kfree(kvm->arch.vpit);
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (pit) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ flush_kthread_work(&pit->expired);
+ kthread_stop(pit->worker_task);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index c84990b..2f5af07 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -22,19 +22,18 @@ struct kvm_kpit_channel_state {
};
struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
struct kvm_kpit_channel_state channels[3];
u32 flags;
bool is_periodic;
s64 period; /* unit: ns */
struct hrtimer timer;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm *kvm;
u32 speaker_data_on;
+
struct mutex lock;
- struct kvm_pit *pit;
- spinlock_t inject_lock;
- unsigned long irq_ack;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
@@ -57,9 +56,11 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
#endif
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 1facfd6..9db47090 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
return;
new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
- old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
if (new_val == old_val)
return;
if (new_val) {
- __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
ioapic->rtc_status.pending_eoi++;
} else {
- __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
ioapic->rtc_status.pending_eoi--;
rtc_status_pending_eoi_check_valid(ioapic);
}
@@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
{
- if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
+ if (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map)) {
--ioapic->rtc_status.pending_eoi;
rtc_status_pending_eoi_check_valid(ioapic);
}
@@ -236,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
union kvm_ioapic_redirect_entry *e;
int index;
spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
for (index = 0; index < IOAPIC_NUM_PINS; index++) {
e = &ioapic->redirtbl[index];
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
@@ -346,7 +354,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
*/
BUG_ON(ioapic->rtc_status.pending_eoi != 0);
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
- ioapic->rtc_status.dest_map);
+ &ioapic->rtc_status.dest_map);
ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
} else
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
@@ -407,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
struct kvm_ioapic *ioapic, int vector, int trigger_mode)
{
- int i;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ vector == dest_map->vectors[vcpu->vcpu_id])
+ rtc_irq_eoi(ioapic, vcpu);
for (i = 0; i < IOAPIC_NUM_PINS; i++) {
union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
@@ -416,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
if (ent->fields.vector != vector)
continue;
- if (i == RTC_GSI)
- rtc_irq_eoi(ioapic, vcpu);
/*
* We are dropping lock while calling ack notifiers because ack
* notifier callbacks for assigned devices call into IOAPIC
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 2d16dc2..7d2692a 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -40,9 +40,21 @@ struct kvm_vcpu;
#define RTC_GSI -1U
#endif
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPUS);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPUS];
+};
+
+
struct rtc_status {
int pending_eoi;
- DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+ struct dest_map dest_map;
};
union kvm_ioapic_redirect_entry {
@@ -118,7 +130,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, unsigned long *dest_map);
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 3982b47..95fcc7b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -33,7 +33,10 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- return apic_has_pending_timer(vcpu);
+ if (lapic_in_kernel(vcpu))
+ return apic_has_pending_timer(vcpu);
+
+ return 0;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
- kvm_inject_apic_timer_irqs(vcpu);
- /* TODO: PIT, RTC etc. */
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ae5c78f..61ebdc1 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
return ret;
}
-static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
-{
- /* Same as irqchip_in_kernel(vcpu->kvm), but with less
- * pointer chasing and no unnecessary memory barriers.
- */
- return vcpu->arch.apic != NULL;
-}
-
void kvm_pic_reset(struct kvm_kpic_state *s);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8fc89ef..54ead79 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -34,6 +34,7 @@
#include "lapic.h"
#include "hyperv.h"
+#include "x86.h"
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
@@ -53,10 +54,12 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
}
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, unsigned long *dest_map)
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
int i, r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
kvm_lowest_prio_delivery(irq)) {
@@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
return r;
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
@@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
} else if (kvm_lapic_enabled(vcpu)) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
}
}
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
if (lowest)
r = kvm_apic_set_irq(lowest, irq, dest_map);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36591fa..d9ae1ce 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return;
feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -475,26 +475,20 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
- int highest_irr;
-
/* This may race with setting of irr in __apic_accept_irq() and
* value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
* will cause vmexit immediately and the value will be recalculated
* on the next vmentry.
*/
- if (!kvm_vcpu_has_lapic(vcpu))
- return 0;
- highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-
- return highest_irr;
+ return apic_find_highest_irr(vcpu->arch.apic);
}
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
- unsigned long *dest_map);
+ struct dest_map *dest_map);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
- unsigned long *dest_map)
+ struct dest_map *dest_map)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -675,8 +669,33 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
}
}
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ u32 mod;
+ int i, idx = -1;
+
+ mod = vector % dest_vcpus;
+
+ for (i = 0; i <= mod; i++) {
+ idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+ BUG_ON(idx == bitmap_size);
+ }
+
+ return idx;
+}
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ printk(KERN_INFO
+ "Disabled LAPIC found during irq injection\n");
+ }
+}
+
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
{
struct kvm_apic_map *map;
unsigned long bitmap = 1;
@@ -727,21 +746,42 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
dst = map->logical_map[cid];
- if (kvm_lowest_prio_delivery(irq)) {
+ if (!kvm_lowest_prio_delivery(irq))
+ goto set_irq;
+
+ if (!kvm_vector_hashing_enabled()) {
int l = -1;
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
continue;
if (l < 0)
l = i;
- else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+ else if (kvm_apic_compare_prio(dst[i]->vcpu,
+ dst[l]->vcpu) < 0)
l = i;
}
-
bitmap = (l >= 0) ? 1 << l : 0;
+ } else {
+ int idx;
+ unsigned int dest_vcpus;
+
+ dest_vcpus = hweight16(bitmap);
+ if (dest_vcpus == 0)
+ goto out;
+
+ idx = kvm_vector_to_index(irq->vector,
+ dest_vcpus, &bitmap, 16);
+
+ if (!dst[idx]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ goto out;
+ }
+
+ bitmap = (idx >= 0) ? 1 << idx : 0;
}
}
+set_irq:
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
continue;
@@ -754,6 +794,20 @@ out:
return ret;
}
+/*
+ * This routine tries to handler interrupts in posted mode, here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destinaiton vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
@@ -795,16 +849,37 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
if (cid >= ARRAY_SIZE(map->logical_map))
goto out;
- for_each_set_bit(i, &bitmap, 16) {
- dst = map->logical_map[cid][i];
- if (++r == 2)
+ if (kvm_vector_hashing_enabled() &&
+ kvm_lowest_prio_delivery(irq)) {
+ int idx;
+ unsigned int dest_vcpus;
+
+ dest_vcpus = hweight16(bitmap);
+ if (dest_vcpus == 0)
goto out;
- }
- if (dst && kvm_apic_present(dst->vcpu))
+ idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ &bitmap, 16);
+
+ dst = map->logical_map[cid][idx];
+ if (!dst) {
+ kvm_apic_disabled_lapic_found(kvm);
+ goto out;
+ }
+
*dest_vcpu = dst->vcpu;
- else
- goto out;
+ } else {
+ for_each_set_bit(i, &bitmap, 16) {
+ dst = map->logical_map[cid][i];
+ if (++r == 2)
+ goto out;
+ }
+
+ if (dst && kvm_apic_present(dst->vcpu))
+ *dest_vcpu = dst->vcpu;
+ else
+ goto out;
+ }
}
ret = true;
@@ -819,7 +894,7 @@ out:
*/
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
- unsigned long *dest_map)
+ struct dest_map *dest_map)
{
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -839,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
result = 1;
- if (dest_map)
- __set_bit(vcpu->vcpu_id, dest_map);
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
@@ -1239,7 +1316,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 guest_tsc, tsc_deadline;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return;
if (apic->lapic_timer.expired_tscdeadline == 0)
@@ -1515,8 +1592,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_has_lapic(vcpu))
- apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+ apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
@@ -1566,7 +1642,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return 0;
@@ -1577,7 +1653,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return;
@@ -1590,9 +1666,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu))
- return;
-
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
}
@@ -1601,9 +1674,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
u64 tpr;
- if (!kvm_vcpu_has_lapic(vcpu))
- return 0;
-
tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
@@ -1728,8 +1798,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
- apic_lvt_enabled(apic, APIC_LVTT))
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
return atomic_read(&apic->lapic_timer.pending);
return 0;
@@ -1826,7 +1895,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
- if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+ if (!apic_enabled(apic))
return -1;
apic_update_ppr(apic);
@@ -1854,9 +1923,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu))
- return;
-
if (atomic_read(&apic->lapic_timer.pending) > 0) {
kvm_apic_local_deliver(apic, APIC_LVTT);
if (apic_lvtt_tscdeadline(apic))
@@ -1932,7 +1998,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
struct hrtimer *timer;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return;
timer = &vcpu->arch.apic->lapic_timer.timer;
@@ -2105,7 +2171,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return 1;
/* if this is ICR write vector before command */
@@ -2119,7 +2185,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 low, high = 0;
- if (!kvm_vcpu_has_lapic(vcpu))
+ if (!lapic_in_kernel(vcpu))
return 1;
if (apic_reg_read(apic, reg, 4, &low))
@@ -2151,7 +2217,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
u8 sipi_vector;
unsigned long pe;
- if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+ if (!lapic_in_kernel(vcpu) || !apic->pending_events)
return;
/*
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 41bdb35..f71183e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -42,6 +42,9 @@ struct kvm_lapic {
unsigned long pending_events;
unsigned int sipi_vector;
};
+
+struct dest_map;
+
int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
- unsigned long *dest_map);
+ struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -103,7 +106,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
extern struct static_key kvm_no_apic_vcpu;
-static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
if (static_key_false(&kvm_no_apic_vcpu))
return vcpu->arch.apic;
@@ -130,7 +133,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
}
static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
@@ -150,7 +153,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
}
static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
@@ -161,7 +164,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
static inline int kvm_apic_id(struct kvm_lapic *apic)
@@ -175,4 +178,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 95a955d..2463de0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -41,6 +41,7 @@
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
/*
* When setting this variable to true it enables Two-Dimensional-Paging
@@ -776,62 +777,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->disallow_lpage += count;
+ WARN_ON(linfo->disallow_lpage < 0);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
gfn_t gfn;
- int i;
+ kvm->arch.indirect_shadow_pages++;
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
- for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count += 1;
- }
- kvm->arch.indirect_shadow_pages++;
+
+ /* the non-leaf shadow pages are keeping readonly. */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_add_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
gfn_t gfn;
- int i;
+ kvm->arch.indirect_shadow_pages--;
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
- for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- linfo = lpage_info_slot(gfn, slot, i);
- linfo->write_count -= 1;
- WARN_ON(linfo->write_count < 0);
- }
- kvm->arch.indirect_shadow_pages--;
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-static int __has_wrprotected_page(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
struct kvm_lpage_info *linfo;
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
- return linfo->write_count;
+ return !!linfo->disallow_lpage;
}
- return 1;
+ return true;
}
-static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int level)
{
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return __has_wrprotected_page(gfn, level, slot);
+ return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}
static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@ -897,7 +921,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (__has_wrprotected_page(large_gfn, level, slot))
+ if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
break;
return level - 1;
@@ -1323,23 +1347,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn)
{
- struct kvm_memory_slot *slot;
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
return write_protected;
}
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
@@ -1754,7 +1784,7 @@ static void mark_unsync(u64 *spte)
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- return 1;
+ return 0;
}
static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1840,13 +1870,16 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return nr_unsync_leaf;
}
+#define INVALID_INDEX (-1)
+
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
+ pvec->nr = 0;
if (!sp->unsync_children)
return 0;
- mmu_pages_add(pvec, sp, 0);
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
return __mmu_unsync_walk(sp, pvec);
}
@@ -1883,37 +1916,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
if ((_sp)->role.direct || (_sp)->role.invalid) {} else
/* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list, bool clear_unsync)
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
+ return false;
}
- if (clear_unsync)
- kvm_unlink_unsync_page(vcpu->kvm, sp);
-
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return 1;
+ return false;
}
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- return 0;
+ return true;
}
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+ struct list_head *invalid_list,
+ bool remote_flush, bool local_flush)
{
- LIST_HEAD(invalid_list);
- int ret;
-
- ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
- if (ret)
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (!list_empty(invalid_list)) {
+ kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ return;
+ }
- return ret;
+ if (remote_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else if (local_flush)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
#ifdef CONFIG_KVM_MMU_AUDIT
@@ -1923,46 +1954,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- return __kvm_sync_page(vcpu, sp, invalid_list, true);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return __kvm_sync_page(vcpu, sp, invalid_list);
}
/* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *s;
- LIST_HEAD(invalid_list);
- bool flush = false;
+ bool ret = false;
for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
if (!s->unsync)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- kvm_unlink_unsync_page(vcpu->kvm, s);
- if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s))) {
- kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
- continue;
- }
- flush = true;
+ ret |= kvm_sync_page(vcpu, s, invalid_list);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- if (flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ return ret;
}
struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- unsigned int idx[PT64_ROOT_LEVEL-1];
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+ unsigned int idx[PT64_ROOT_LEVEL];
};
#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_next(&pvec, &parents, -1), \
- sp = pvec.page[i].sp; \
+ for (i = mmu_pages_first(&pvec, &parents); \
i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
i = mmu_pages_next(&pvec, &parents, i))
@@ -1974,19 +1997,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
for (n = i+1; n < pvec->nr; n++) {
struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- parents->idx[0] = pvec->page[n].idx;
- return n;
- }
+ parents->idx[level-1] = idx;
+ if (level == PT_PAGE_TABLE_LEVEL)
+ break;
- parents->parent[sp->role.level-2] = sp;
- parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ parents->parent[level-2] = sp;
}
return n;
}
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+ parents->parent[level-2] = sp;
+
+ /* Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
struct kvm_mmu_page *sp;
@@ -1994,22 +2041,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
do {
unsigned int idx = parents->idx[level];
-
sp = parents->parent[level];
if (!sp)
return;
+ WARN_ON(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;
- } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- struct mmu_page_path *parents,
- struct kvm_mmu_pages *pvec)
-{
- parents->parent[parent->role.level-1] = NULL;
- pvec->nr = 0;
+ } while (!sp->unsync_children);
}
static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@ -2020,30 +2059,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
LIST_HEAD(invalid_list);
+ bool flush = false;
- kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu, sp->gfn);
- if (protected)
+ if (protected) {
kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = false;
+ }
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp, &invalid_list);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
+ if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
}
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
}
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
- sp->write_flooding_count = 0;
+ atomic_set(&sp->write_flooding_count, 0);
}
static void clear_sp_write_flooding_count(u64 *spte)
@@ -2069,6 +2114,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct kvm_mmu_page *sp;
bool need_sync = false;
+ bool flush = false;
+ LIST_HEAD(invalid_list);
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -2092,8 +2139,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->role.word != role.word)
continue;
- if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
- break;
+ if (sp->unsync) {
+ /* The page is good, but __kvm_sync_page might still end
+ * up zapping it. If so, break in order to rebuild it.
+ */
+ if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ break;
+
+ WARN_ON(!list_empty(&invalid_list));
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
if (sp->unsync_children)
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -2112,16 +2167,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
- if (rmap_write_protect(vcpu, gfn))
+ /*
+ * we should do write protection before syncing pages
+ * otherwise the content of the synced shadow page may
+ * be inconsistent with guest page table.
+ */
+ account_shadowed(vcpu->kvm, sp);
+ if (level == PT_PAGE_TABLE_LEVEL &&
+ rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
- if (level > PT_PAGE_TABLE_LEVEL && need_sync)
- kvm_sync_pages(vcpu, gfn);
- account_shadowed(vcpu->kvm, sp);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
return sp;
}
@@ -2269,7 +2332,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
if (parent->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
- kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
struct kvm_mmu_page *sp;
@@ -2278,7 +2340,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
mmu_pages_clear_parents(&parents);
zapped++;
}
- kvm_mmu_pages_init(parent, &parents, &pages);
}
return zapped;
@@ -2354,8 +2415,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
if (list_empty(&kvm->arch.active_mmu_pages))
return false;
- sp = list_entry(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
+ sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
return true;
@@ -2408,7 +2469,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
@@ -2417,37 +2478,26 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
kvm_mmu_mark_parents_unsync(sp);
}
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
{
- struct kvm_mmu_page *s;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- if (s->unsync)
- continue;
- WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- __kvm_unsync_page(vcpu, s);
- }
-}
+ struct kvm_mmu_page *sp;
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
-{
- struct kvm_mmu_page *s;
- bool need_unsync = false;
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (!can_unsync)
- return 1;
+ return true;
- if (s->role.level != PT_PAGE_TABLE_LEVEL)
- return 1;
+ if (sp->unsync)
+ continue;
- if (!s->unsync)
- need_unsync = true;
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unsync_page(vcpu, sp);
}
- if (need_unsync)
- kvm_unsync_pages(vcpu, gfn);
- return 0;
+
+ return false;
}
static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -2503,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu, gfn, level))
+ mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
goto done;
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2768,7 +2818,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
* mmu_notifier_retry was successful and we hold the
@@ -2796,20 +2846,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
kvm_pfn_t pfn, unsigned access, int *ret_val)
{
- bool ret = true;
-
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
- goto exit;
+ return true;
}
if (unlikely(is_noslot_pfn(pfn)))
vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- ret = false;
-exit:
- return ret;
+ return false;
}
static bool page_fault_can_be_fast(u32 error_code)
@@ -3273,7 +3319,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
}
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
if (direct)
return vcpu_match_mmio_gpa(vcpu, addr);
@@ -3332,7 +3378,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
u64 spte;
bool reserved;
- if (quickly_check_mmio_pf(vcpu, addr, direct))
+ if (mmio_info_in_cache(vcpu, addr, direct))
return RET_MMIO_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
@@ -3362,20 +3408,53 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ u32 error_code, gfn_t gfn)
+{
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ /*
+ * guest is writing the page which is write tracked which can
+ * not be fixed by page fault handler.
+ */
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ clear_sp_write_flooding_count(iterator.sptep);
+ if (!is_shadow_present_pte(spte))
+ break;
+ }
+ walk_shadow_page_lockless_end(vcpu);
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
- gfn_t gfn;
+ gfn_t gfn = gva >> PAGE_SHIFT;
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gva, true);
-
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
- }
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return 1;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3383,7 +3462,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
error_code, gfn, prefault);
@@ -3460,12 +3538,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, gpa, true);
-
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
- }
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return 1;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3558,13 +3632,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
-static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+ unsigned level, unsigned gpte)
{
- unsigned index;
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
- index = level - 1;
- index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
- return mmu->last_pte_bitmap & (1 << index);
+ /*
+ * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+ * If it is clear, there are no large pages at this level, so clear
+ * PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - mmu->last_nonleaf_level;
+
+ return gpte & PT_PAGE_SIZE_MASK;
}
#define PTTYPE_EPT 18 /* arbitrary */
@@ -3836,22 +3921,13 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
}
}
-static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
- u8 map;
- unsigned level, root_level = mmu->root_level;
- const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
-
- if (root_level == PT32E_ROOT_LEVEL)
- --root_level;
- /* PT_PAGE_TABLE_LEVEL always terminates */
- map = 1 | (1 << ps_set_index);
- for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
- if (level <= PT_PDPE_LEVEL
- && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
- map |= 1 << (ps_set_index | (level - 1));
- }
- mmu->last_pte_bitmap = map;
+ unsigned root_level = mmu->root_level;
+
+ mmu->last_nonleaf_level = root_level;
+ if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+ mmu->last_nonleaf_level++;
}
static void paging64_init_context_common(struct kvm_vcpu *vcpu,
@@ -3863,7 +3939,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
@@ -3890,7 +3966,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
@@ -3948,7 +4024,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, context, false);
- update_last_pte_bitmap(vcpu, context);
+ update_last_nonleaf_level(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
@@ -4054,7 +4130,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, g_context, false);
- update_last_pte_bitmap(vcpu, g_context);
+ update_last_nonleaf_level(vcpu, g_context);
}
static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -4125,18 +4201,6 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
- bool remote_flush, bool local_flush)
-{
- if (zap_page)
- return;
-
- if (remote_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
const u8 *new, int *bytes)
{
@@ -4186,7 +4250,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return false;
- return ++sp->write_flooding_count >= 3;
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
}
/*
@@ -4248,15 +4313,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush, zap_page;
+ bool remote_flush, local_flush;
union kvm_mmu_page_role mask = { };
mask.cr0_wp = 1;
@@ -4273,7 +4338,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- zap_page = remote_flush = local_flush = false;
+ remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -4293,8 +4358,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
- zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- &invalid_list);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -4316,8 +4380,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
++spte;
}
}
- mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
}
@@ -4354,32 +4417,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
-static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
-{
- if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
- return vcpu_match_mmio_gpa(vcpu, addr);
-
- return vcpu_match_mmio_gva(vcpu, addr);
-}
-
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_RETRY;
enum emulation_result er;
+ bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, cr2, direct);
+ if (r == RET_MMIO_PF_EMULATE) {
+ emulation_type = 0;
+ goto emulate;
+ }
+ if (r == RET_MMIO_PF_RETRY)
+ return 1;
+ if (r < 0)
+ return r;
+ }
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
- goto out;
-
- if (!r) {
- r = 1;
- goto out;
- }
+ return r;
+ if (!r)
+ return 1;
- if (is_mmio_page_fault(vcpu, cr2))
+ if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
-
+emulate:
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
switch (er) {
@@ -4393,8 +4458,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
default:
BUG();
}
-out:
- return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -4463,6 +4526,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ node->track_write = kvm_mmu_pte_write;
+ kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ kvm_page_track_unregister_notifier(kvm, node);
+}
+
/* The return value indicates if tlb flush on all vcpus is needed. */
typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 55ffb7b..58fe98a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -174,4 +174,9 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn);
#endif
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
new file mode 100644
index 0000000..11f7643
--- /dev/null
+++ b/arch/x86/kvm/page_track.c
@@ -0,0 +1,222 @@
+/*
+ * Support KVM gust page tracking
+ *
+ * This feature allows us to track page access in guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_page_track.h>
+
+#include "mmu.h"
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
+ if (!dont || free->arch.gfn_track[i] !=
+ dont->arch.gfn_track[i]) {
+ kvfree(free->arch.gfn_track[i]);
+ free->arch.gfn_track[i] = NULL;
+ }
+}
+
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ slot->arch.gfn_track[i] = kvm_kvzalloc(npages *
+ sizeof(*slot->arch.gfn_track[i]));
+ if (!slot->arch.gfn_track[i])
+ goto track_free;
+ }
+
+ return 0;
+
+track_free:
+ kvm_page_track_free_memslot(slot, NULL);
+ return -ENOMEM;
+}
+
+static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+{
+ if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
+ return false;
+
+ return true;
+}
+
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode, short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+ val = slot->arch.gfn_track[mode][index];
+
+ if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_track[mode][index] += count;
+}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, 1);
+
+ /*
+ * new track stops large page mapping for the
+ * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (mode == KVM_PAGE_TRACK_WRITE)
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page. It is the opposed operation of
+ * kvm_slot_page_track_add_page().
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, -1);
+
+ /*
+ * allow large page mapping for the tracked page
+ * after the tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * check if the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return false;
+
+ return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
+}
+
+void kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ init_srcu_struct(&head->track_srcu);
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+}
+
+/*
+ * register the notifier so that event interception for the tracked guest
+ * pages can be received.
+ */
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * stop receiving the event interception. It is the opposed operation of
+ * kvm_page_track_register_notifier().
+ */
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ spin_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+}
+
+/*
+ * Notify the node that write access is intercepted and write emulation is
+ * finished at this time.
+ *
+ * The node should figure out if the written page is the one that node is
+ * interested in by itself.
+ */
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &vcpu->kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_write)
+ n->track_write(vcpu, gpa, new, bytes);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2ce4f05..e159a81 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
ACC_USER_MASK;
#else
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- access &= ~(gpte >> PT64_NX_SHIFT);
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
+ access ^= (gpte >> PT64_NX_SHIFT);
#endif
return access;
@@ -702,24 +705,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
- if (likely(r != RET_MMIO_PF_INVALID))
- return r;
-
- /*
- * page fault with PFEC.RSVD = 1 is caused by shadow
- * page fault, should not be used to walk guest page
- * table.
- */
- error_code &= ~PFERR_RSVD_MASK;
- };
-
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
/*
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ error_code &= ~PFERR_RSVD_MASK;
+
+ /*
* Look up the guest pte for the faulting address.
*/
r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
@@ -735,6 +731,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
+ if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+ shadow_page_table_clear_flood(vcpu, addr);
+ return 1;
+ }
+
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
@@ -945,7 +946,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
- return -EINVAL;
+ return 0;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
@@ -977,7 +978,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable);
}
- return !nr_present;
+ return nr_present;
}
#undef pt_element_t
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 31aa2c8..06ce377 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.apic)
+ if (lapic_in_kernel(vcpu))
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c13a64b..9507038 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1858,8 +1858,7 @@ static int halt_interception(struct vcpu_svm *svm)
static int vmmcall_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- kvm_emulate_hypercall(&svm->vcpu);
- return 1;
+ return kvm_emulate_hypercall(&svm->vcpu);
}
static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ad9f6a2..2f1ea2f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm,
* Tracepoint for VT-d posted-interrupts.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
- unsigned int gvec, u64 pi_desc_addr, bool set),
- TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+ unsigned int gsi, unsigned int gvec,
+ u64 pi_desc_addr, bool set),
+ TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
TP_STRUCT__entry(
+ __field( unsigned int, host_irq )
__field( unsigned int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
@@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update,
),
TP_fast_assign(
+ __entry->host_irq = host_irq;
__entry->vcpu_id = vcpu_id;
__entry->gsi = gsi;
__entry->gvec = gvec;
@@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update,
__entry->set = set;
),
- TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+ TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
"gvec: 0x%x, pi_desc_addr: 0x%llx",
__entry->set ? "enabled and being updated" : "disabled",
+ __entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
__entry->gvec,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e2951b6..46154da 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -961,25 +961,36 @@ static const u32 vmx_msr_index[] = {
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+ (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
}
static inline bool is_no_device(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, NM_VECTOR);
}
static inline bool is_invalid_opcode(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, UD_VECTOR);
}
static inline bool is_external_interrupt(u32 intr_info)
@@ -5608,11 +5619,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
}
if (vcpu->guest_debug == 0) {
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_MOV_DR_EXITING);
/*
* No more DR vmexits; force a reload of the debug registers
@@ -5649,8 +5657,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
@@ -5659,10 +5665,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
}
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5747,8 +5750,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
- kvm_emulate_hypercall(vcpu);
- return 1;
+ return kvm_emulate_hypercall(vcpu);
}
static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6435,8 +6437,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
/* Recycle the least recently used VMCS. */
- item = list_entry(vmx->nested.vmcs02_pool.prev,
- struct vmcs02_list, list);
+ item = list_last_entry(&vmx->nested.vmcs02_pool,
+ struct vmcs02_list, list);
item->vmptr = vmx->nested.current_vmptr;
list_move(&item->list, &vmx->nested.vmcs02_pool);
return &item->vmcs02;
@@ -7752,6 +7754,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
return false;
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK));
case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -10764,13 +10773,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
*/
kvm_set_msi_irq(e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
continue;
+ }
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
- trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (set)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f4891f2..60d6c00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -1196,17 +1199,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
- uint32_t quotient, remainder;
-
- /* Don't try to replace with do_div(), this one calculates
- * "(dividend << 32) / divisor" */
- __asm__ ( "divl %4"
- : "=a" (quotient), "=d" (remainder)
- : "0" (0), "1" (dividend), "r" (divisor) );
- return quotient;
+ do_shl32_div32(dividend, divisor);
+ return dividend;
}
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
s8 *pshift, u32 *pmultiplier)
{
uint64_t scaled64;
@@ -1214,8 +1211,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
uint64_t tps64;
uint32_t tps32;
- tps64 = base_khz * 1000LL;
- scaled64 = scaled_khz * 1000LL;
+ tps64 = base_hz;
+ scaled64 = scaled_hz;
while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
tps64 >>= 1;
shift--;
@@ -1233,8 +1230,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
*pshift = shift;
*pmultiplier = div_frac(scaled64, tps32);
- pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
- __func__, base_khz, scaled_khz, shift, *pmultiplier);
+ pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+ __func__, base_hz, scaled_hz, shift, *pmultiplier);
}
#ifdef CONFIG_X86_64
@@ -1293,23 +1290,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
return 0;
}
-static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
- if (this_tsc_khz == 0) {
+ if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
return -1;
}
/* Compute a scale to convert nanoseconds in TSC cycles */
- kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
&vcpu->arch.virtual_tsc_shift,
&vcpu->arch.virtual_tsc_mult);
- vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
/*
* Compute the variation in TSC rate which is acceptable
@@ -1319,11 +1316,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
*/
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
- if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+ if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
- return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1716,7 +1713,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+ unsigned long flags, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1742,8 +1739,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
- if (unlikely(this_tsc_khz == 0)) {
+ tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ if (unlikely(tgt_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
return 1;
@@ -1778,13 +1775,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (!vcpu->pv_time_enabled)
return 0;
- if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- tgt_tsc_khz = kvm_has_tsc_control ?
- vcpu->virtual_tsc_khz : this_tsc_khz;
- kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+ if (kvm_has_tsc_control)
+ tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+ if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
- vcpu->hw_tsc_khz = this_tsc_khz;
+ vcpu->hw_tsc_khz = tgt_tsc_khz;
}
/* With all the info we got, fill in the values */
@@ -2988,7 +2986,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
- kvm_vcpu_has_lapic(vcpu))
+ lapic_in_kernel(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
@@ -3001,7 +2999,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (kvm_vcpu_has_lapic(vcpu)) {
+ if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
else
@@ -3241,7 +3239,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
- if (!vcpu->arch.apic)
+ if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
@@ -3259,7 +3257,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_LAPIC: {
r = -EINVAL;
- if (!vcpu->arch.apic)
+ if (!lapic_in_kernel(vcpu))
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
if (IS_ERR(u.lapic))
@@ -3606,20 +3604,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int i;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
for (i = 0; i < 3; i++)
- kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
return 0;
}
@@ -3639,29 +3643,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
int start = 0;
int i;
u32 prev_legacy, cur_legacy;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
if (!prev_legacy && cur_legacy)
start = 1;
- memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
- sizeof(kvm->arch.vpit->pit_state.channels));
- kvm->arch.vpit->pit_state.flags = ps->flags;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
for (i = 0; i < 3; i++)
- kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
start && i == 0);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
struct kvm_reinject_control *control)
{
- if (!kvm->arch.vpit)
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (!pit)
return -ENXIO;
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
return 0;
}
@@ -4094,7 +4108,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
do {
n = min(len, 8);
- if (!(vcpu->arch.apic &&
+ if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
&& kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
break;
@@ -4114,7 +4128,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
do {
n = min(len, 8);
- if (!(vcpu->arch.apic &&
+ if (!(lapic_in_kernel(vcpu) &&
!kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
addr, n, v))
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@ -4347,7 +4361,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ kvm_page_track_write(vcpu, gpa, val, bytes);
return 1;
}
@@ -4605,7 +4619,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CMPXCHG_FAILED;
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
- kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+ kvm_page_track_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -6011,7 +6025,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops->update_cr8_intercept)
return;
- if (!vcpu->arch.apic)
+ if (!lapic_in_kernel(vcpu))
return;
if (vcpu->arch.apicv_active)
@@ -7039,7 +7053,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- if (!kvm_vcpu_has_lapic(vcpu) &&
+ if (!lapic_in_kernel(vcpu) &&
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
return -EINVAL;
@@ -7594,6 +7608,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
}
struct static_key kvm_no_apic_vcpu __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
@@ -7725,6 +7740,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_page_track_init(kvm);
+ kvm_mmu_init_vm(kvm);
+
return 0;
}
@@ -7851,6 +7869,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvm_mmu_uninit_vm(kvm);
}
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -7872,6 +7891,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
free->arch.lpage_info[i - 1] = NULL;
}
}
+
+ kvm_page_track_free_memslot(free, dont);
}
int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -7880,6 +7901,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
unsigned long ugfn;
int lpages;
int level = i + 1;
@@ -7894,15 +7916,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
if (i == 0)
continue;
- slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
- sizeof(*slot->arch.lpage_info[i - 1]));
- if (!slot->arch.lpage_info[i - 1])
+ linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+ if (!linfo)
goto out_free;
+ slot->arch.lpage_info[i - 1] = linfo;
+
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i - 1][0].write_count = 1;
+ linfo[0].disallow_lpage = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+ linfo[lpages - 1].disallow_lpage = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
@@ -7914,10 +7937,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned long j;
for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i - 1][j].write_count = 1;
+ linfo[j].disallow_lpage = 1;
}
}
+ if (kvm_page_track_create_memslot(slot, npages))
+ goto out_free;
+
return 0;
out_free:
@@ -8371,6 +8397,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
}
+bool kvm_vector_hashing_enabled(void)
+{
+ return vector_hashing;
+}
+EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f2afa5f..007940f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -179,6 +179,7 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
+bool kvm_vector_hashing_enabled(void);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -192,4 +193,19 @@ extern unsigned int min_timer_period_us;
extern unsigned int lapic_timer_advance_ns;
extern struct static_key kvm_no_apic_vcpu;
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base) \
+ ({ \
+ u32 __quot, __rem; \
+ asm("divl %2" : "=a" (__quot), "=d" (__rem) \
+ : "rm" (base), "0" (0), "1" ((u32) n)); \
+ n = __quot; \
+ __rem; \
+ })
+
#endif
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 4ebc796..2f8c0f4 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -256,12 +256,6 @@ struct hv_monitor_page {
u8 rsvdz4[1984];
};
-/* Declare the various hypercall operations. */
-enum hv_call_code {
- HVCALL_POST_MESSAGE = 0x005c,
- HVCALL_SIGNAL_EVENT = 0x005d,
-};
-
/* Definition of the hv_post_message hypercall input structure. */
struct hv_input_post_message {
union hv_connection_id connectionid;
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index d6f8322..aa69253 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -359,14 +359,15 @@ TRACE_EVENT(
#endif
TRACE_EVENT(kvm_halt_poll_ns,
- TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+ TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new,
+ unsigned int old),
TP_ARGS(grow, vcpu_id, new, old),
TP_STRUCT__entry(
__field(bool, grow)
__field(unsigned int, vcpu_id)
- __field(int, new)
- __field(int, old)
+ __field(unsigned int, new)
+ __field(unsigned int, old)
),
TP_fast_assign(
@@ -376,7 +377,7 @@ TRACE_EVENT(kvm_halt_poll_ns,
__entry->old = old;
),
- TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+ TP_printk("vcpu %u: halt_poll_ns %u (%s %u)",
__entry->vcpu_id,
__entry->new,
__entry->grow ? "grow" : "shrink",
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 50f44a2..a7f1f80 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -157,6 +157,7 @@ struct kvm_s390_skeys {
struct kvm_hyperv_exit {
#define KVM_EXIT_HYPERV_SYNIC 1
+#define KVM_EXIT_HYPERV_HCALL 2
__u32 type;
union {
struct {
@@ -165,6 +166,11 @@ struct kvm_hyperv_exit {
__u64 evt_page;
__u64 msg_page;
} synic;
+ struct {
+ __u64 input;
+ __u64 result;
+ __u64 params[2];
+ } hcall;
} u;
};
@@ -541,7 +547,13 @@ struct kvm_s390_pgm_info {
__u8 exc_access_id;
__u8 per_access_id;
__u8 op_access_id;
- __u8 pad[3];
+#define KVM_S390_PGM_FLAGS_ILC_VALID 0x01
+#define KVM_S390_PGM_FLAGS_ILC_0 0x02
+#define KVM_S390_PGM_FLAGS_ILC_1 0x04
+#define KVM_S390_PGM_FLAGS_ILC_MASK 0x06
+#define KVM_S390_PGM_FLAGS_NO_REWIND 0x08
+ __u8 flags;
+ __u8 pad[2];
};
struct kvm_s390_prefix_info {
@@ -850,8 +862,9 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
#define KVM_CAP_HYPERV_SYNIC 123
#define KVM_CAP_S390_RI 124
-#define KVM_CAP_ARM_PMU_V3 125
-#define KVM_CAP_VCPU_ATTRIBUTES 126
+#define KVM_CAP_SPAPR_TCE_64 125
+#define KVM_CAP_ARM_PMU_V3 126
+#define KVM_CAP_VCPU_ATTRIBUTES 127
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1144,6 +1157,8 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_PPC_ALLOC_HTAB */
#define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32)
#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_64 _IOW(KVMIO, 0xa8, \
+ struct kvm_create_spapr_tce_64)
/* Available with KVM_CAP_RMA */
#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma)
/* Available with KVM_CAP_PPC_HTAB_FD */
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index db2dd33..b866374 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -109,8 +109,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
- list_entry(vcpu->async_pf.queue.next,
- typeof(*work), queue);
+ list_first_entry(&vcpu->async_pf.queue,
+ typeof(*work), queue);
list_del(&work->queue);
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
@@ -127,8 +127,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
- list_entry(vcpu->async_pf.done.next,
- typeof(*work), link);
+ list_first_entry(&vcpu->async_pf.done,
+ typeof(*work), link);
list_del(&work->link);
kmem_cache_free(async_pf_cache, work);
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a11cfd2..1eae052 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -72,11 +72,11 @@ module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
/* Default doubles per-vcpu halt_poll_ns. */
static unsigned int halt_poll_ns_grow = 2;
-module_param(halt_poll_ns_grow, int, S_IRUGO);
+module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
/* Default resets per-vcpu halt_poll_ns . */
static unsigned int halt_poll_ns_shrink;
-module_param(halt_poll_ns_shrink, int, S_IRUGO);
+module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
/*
* Ordering of locks:
@@ -620,13 +620,10 @@ void *kvm_kvzalloc(unsigned long size)
static void kvm_destroy_devices(struct kvm *kvm)
{
- struct list_head *node, *tmp;
+ struct kvm_device *dev, *tmp;
- list_for_each_safe(node, tmp, &kvm->devices) {
- struct kvm_device *dev =
- list_entry(node, struct kvm_device, vm_node);
-
- list_del(node);
+ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
+ list_del(&dev->vm_node);
dev->ops->destroy(dev);
}
}
@@ -1437,11 +1434,17 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
{
unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
- if (addr == KVM_HVA_ERR_RO_BAD)
+ if (addr == KVM_HVA_ERR_RO_BAD) {
+ if (writable)
+ *writable = false;
return KVM_PFN_ERR_RO_FAULT;
+ }
- if (kvm_is_error_hva(addr))
+ if (kvm_is_error_hva(addr)) {
+ if (writable)
+ *writable = false;
return KVM_PFN_NOSLOT;
+ }
/* Do not map writable pfn in the readonly memslot. */
if (writable && memslot_is_readonly(slot)) {
@@ -1943,14 +1946,15 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
{
- int old, val;
+ unsigned int old, val, grow;
old = val = vcpu->halt_poll_ns;
+ grow = READ_ONCE(halt_poll_ns_grow);
/* 10us base */
- if (val == 0 && halt_poll_ns_grow)
+ if (val == 0 && grow)
val = 10000;
else
- val *= halt_poll_ns_grow;
+ val *= grow;
vcpu->halt_poll_ns = val;
trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
@@ -1958,13 +1962,14 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
{
- int old, val;
+ unsigned int old, val, shrink;
old = val = vcpu->halt_poll_ns;
- if (halt_poll_ns_shrink == 0)
+ shrink = READ_ONCE(halt_poll_ns_shrink);
+ if (shrink == 0)
val = 0;
else
- val /= halt_poll_ns_shrink;
+ val /= shrink;
vcpu->halt_poll_ns = val;
trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
OpenPOWER on IntegriCloud