diff options
author | Ingo Molnar <mingo@kernel.org> | 2016-01-29 09:41:18 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2016-01-29 09:41:18 +0100 |
commit | 76b36fa896a2db64582690e085f36adc76604134 (patch) | |
tree | 78007f123ead6f96cdee6ba98ac3c289c706cc39 /arch/x86 | |
parent | 14365449b6ce34cf6a3040ff8ebbb39d89d67159 (diff) | |
parent | 92e963f50fc74041b5e9e744c330dca48e04f08d (diff) | |
download | op-kernel-dev-76b36fa896a2db64582690e085f36adc76604134.zip op-kernel-dev-76b36fa896a2db64582690e085f36adc76604134.tar.gz |
Merge tag 'v4.5-rc1' into x86/asm, to refresh the branch before merging new changes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
74 files changed, 3317 insertions, 893 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 258965d..330e738 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,12 +24,14 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@ -82,6 +84,8 @@ config X86 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_MMAP_RND_BITS if MMU + select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK @@ -96,7 +100,6 @@ config X86 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS @@ -177,12 +180,23 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config HAVE_LATENCYTOP_SUPPORT - def_bool y - config MMU def_bool y +config ARCH_MMAP_RND_BITS_MIN + default 28 if 64BIT + default 8 + +config ARCH_MMAP_RND_BITS_MAX + default 32 if 64BIT + default 16 + +config ARCH_MMAP_RND_COMPAT_BITS_MIN + default 8 + +config ARCH_MMAP_RND_COMPAT_BITS_MAX + default 16 + config SBUS bool @@ -534,9 +548,10 @@ config X86_INTEL_QUARK config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" - depends on ACPI + depends on X86 && ACPI select COMMON_CLK select PINCTRL + select IOSF_MBI ---help--- Select to build support for Intel Low Power Subsystem such as found on Intel Lynxpoint PCH. Selecting this option enables @@ -2684,6 +2699,19 @@ config PMC_ATOM def_bool y depends on PCI +config VMD + depends on PCI_MSI + tristate "Volume Management Device Driver" + default N + ---help--- + Adds support for the Intel Volume Management Device (VMD). VMD is a + secondary PCI host bridge that allows PCI Express root ports, + and devices attached to them, to be removed from the default + PCI domain and placed within the VMD domain. This provides + more bus resources than are otherwise possible with a + single domain. If you know your system provides one of these and + has devices attached to it, say Y; if you are not sure, say N. + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 110253c..9b18ed9 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. Note that with PAT support - enabled, even in this case there are restrictions on /dev/mem - use due to the cache aliasing requirements. - - If this option is switched on, the /dev/mem file only allows - userspace access to PCI space and the BIOS code and data regions. - This is sufficient for dosemu and X and all common users of - /dev/mem. - - If in doubt, say Y. - config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 2ee62db..bbe1a62 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -60,6 +60,7 @@ clean-files += cpustr.h KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0a291cd..f9ce75d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -33,6 +33,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE :=n LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 440df0c..a69321a 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -219,6 +219,29 @@ static int ghash_async_final(struct ahash_request *req) } } +static int ghash_async_import(struct ahash_request *req, const void *in) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + ghash_async_init(req); + memcpy(dctx, in, sizeof(*dctx)); + return 0; + +} + +static int ghash_async_export(struct ahash_request *req, void *out) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memcpy(out, dctx, sizeof(*dctx)); + return 0; + +} + static int ghash_async_digest(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); @@ -288,8 +311,11 @@ static struct ahash_alg ghash_async_alg = { .final = ghash_async_final, .setkey = ghash_async_setkey, .digest = ghash_async_digest, + .export = ghash_async_export, + .import = ghash_async_import, .halg = { .digestsize = GHASH_DIGEST_SIZE, + .statesize = sizeof(struct ghash_desc_ctx), .base = { .cra_name = "ghash", .cra_driver_name = "ghash-clmulni", diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index f17705e..cb713df 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -383,3 +383,4 @@ 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 +377 i386 copy_file_range sys_copy_file_range diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 314a90b..dc1040a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -332,6 +332,7 @@ 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 +326 common copy_file_range sys_copy_file_range # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 265c0ed..c854541 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -4,6 +4,7 @@ KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n +UBSAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 8602f06..1a50e09 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode) * * On Xen, we don't appear to have that guarantee, but Xen still * supplies a valid seqlock using the version field. - + * * We only do pvclock vdso timing at all if * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to * mean that all vCPUs have matching pvti and that the TSC is * synced, so we can just look at vCPU 0's pvti. */ - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { - *mode = VCLOCK_NONE; - return 0; - } - do { version = pvti->version; smp_rmb(); + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { + *mode = VCLOCK_NONE; + return 0; + } + tsc = rdtsc_ordered(); pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; pvti_tsc_shift = pvti->tsc_shift; diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0681d25..a584e1c 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -31,20 +31,10 @@ #endif #define dma_wmb() barrier() -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() dma_rmb() -#define smp_wmb() barrier() -#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else /* !SMP */ -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) -#endif /* SMP */ - -#define read_barrier_depends() do { } while (0) -#define smp_read_barrier_depends() do { } while (0) +#define __smp_mb() mb() +#define __smp_rmb() dma_rmb() +#define __smp_wmb() barrier() +#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #if defined(CONFIG_X86_PPRO_FENCE) @@ -53,31 +43,31 @@ * model and we should fall back to full barriers. */ -#define smp_store_release(p, v) \ +#define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ - smp_mb(); \ + __smp_mb(); \ WRITE_ONCE(*p, v); \ } while (0) -#define smp_load_acquire(p) \ +#define __smp_load_acquire(p) \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ - smp_mb(); \ + __smp_mb(); \ ___p1; \ }) #else /* regular x86 TSO memory ordering */ -#define smp_store_release(p, v) \ +#define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) -#define smp_load_acquire(p) \ +#define __smp_load_acquire(p) \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ @@ -88,7 +78,9 @@ do { \ #endif /* Atomic operations are already serializing on x86 */ -#define smp_mb__before_atomic() barrier() -#define smp_mb__after_atomic() barrier() +#define __smp_mb__before_atomic() barrier() +#define __smp_mb__after_atomic() barrier() + +#include <asm-generic/barrier.h> #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 03dd729..684ed6c 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -10,6 +10,16 @@ struct dev_archdata { #endif }; +#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) +struct dma_domain { + struct list_head node; + struct dma_map_ops *dma_ops; + int domain_nr; +}; +void add_dma_domain(struct dma_domain *domain); +void del_dma_domain(struct dma_domain *domain); +#endif + struct pdev_archdata { }; diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 953b726..3a27b93 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -46,8 +46,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -#include <asm-generic/dma-mapping-common.h> - extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 1e3408e..1815b73 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -130,6 +130,11 @@ struct irq_alloc_info { char *uv_name; }; #endif +#if IS_ENABLED(CONFIG_VMD) + struct { + struct msi_desc *desc; + }; +#endif }; }; diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h new file mode 100644 index 0000000..201eb9d --- /dev/null +++ b/arch/x86/include/asm/intel_punit_ipc.h @@ -0,0 +1,101 @@ +#ifndef _ASM_X86_INTEL_PUNIT_IPC_H_ +#define _ASM_X86_INTEL_PUNIT_IPC_H_ + +/* + * Three types of 8bit P-Unit IPC commands are supported, + * bit[7:6]: [00]: BIOS; [01]: GTD; [10]: ISPD. + */ +typedef enum { + BIOS_IPC = 0, + GTDRIVER_IPC, + ISPDRIVER_IPC, + RESERVED_IPC, +} IPC_TYPE; + +#define IPC_TYPE_OFFSET 6 +#define IPC_PUNIT_BIOS_CMD_BASE (BIOS_IPC << IPC_TYPE_OFFSET) +#define IPC_PUNIT_GTD_CMD_BASE (GTDDRIVER_IPC << IPC_TYPE_OFFSET) +#define IPC_PUNIT_ISPD_CMD_BASE (ISPDRIVER_IPC << IPC_TYPE_OFFSET) +#define IPC_PUNIT_CMD_TYPE_MASK (RESERVED_IPC << IPC_TYPE_OFFSET) + +/* BIOS => Pcode commands */ +#define IPC_PUNIT_BIOS_ZERO (IPC_PUNIT_BIOS_CMD_BASE | 0x00) +#define IPC_PUNIT_BIOS_VR_INTERFACE (IPC_PUNIT_BIOS_CMD_BASE | 0x01) +#define IPC_PUNIT_BIOS_READ_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x02) +#define IPC_PUNIT_BIOS_WRITE_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x03) +#define IPC_PUNIT_BIOS_READ_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x04) +#define IPC_PUNIT_BIOS_WRITE_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x05) +#define IPC_PUNIT_BIOS_READ_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x06) +#define IPC_PUNIT_BIOS_WRITE_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x07) +#define IPC_PUNIT_BIOS_TRIGGER_VDD_RAM (IPC_PUNIT_BIOS_CMD_BASE | 0x08) +#define IPC_PUNIT_BIOS_READ_TELE_INFO (IPC_PUNIT_BIOS_CMD_BASE | 0x09) +#define IPC_PUNIT_BIOS_READ_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0a) +#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0b) +#define IPC_PUNIT_BIOS_READ_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0c) +#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0d) +#define IPC_PUNIT_BIOS_READ_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0e) +#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0f) +#define IPC_PUNIT_BIOS_READ_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x10) +#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x11) +#define IPC_PUNIT_BIOS_READ_MODULE_TEMP (IPC_PUNIT_BIOS_CMD_BASE | 0x12) +#define IPC_PUNIT_BIOS_RESERVED (IPC_PUNIT_BIOS_CMD_BASE | 0x13) +#define IPC_PUNIT_BIOS_READ_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x14) +#define IPC_PUNIT_BIOS_WRITE_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x15) +#define IPC_PUNIT_BIOS_READ_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x16) +#define IPC_PUNIT_BIOS_WRITE_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x17) +#define IPC_PUNIT_BIOS_READ_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x18) +#define IPC_PUNIT_BIOS_WRITE_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x19) +#define IPC_PUNIT_BIOS_READ_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1a) +#define IPC_PUNIT_BIOS_WRITE_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1b) + +/* GT Driver => Pcode commands */ +#define IPC_PUNIT_GTD_ZERO (IPC_PUNIT_GTD_CMD_BASE | 0x00) +#define IPC_PUNIT_GTD_CONFIG (IPC_PUNIT_GTD_CMD_BASE | 0x01) +#define IPC_PUNIT_GTD_READ_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x02) +#define IPC_PUNIT_GTD_WRITE_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x03) +#define IPC_PUNIT_GTD_GET_WM_VAL (IPC_PUNIT_GTD_CMD_BASE | 0x06) +#define IPC_PUNIT_GTD_WRITE_CONFIG_WISHREQ (IPC_PUNIT_GTD_CMD_BASE | 0x07) +#define IPC_PUNIT_GTD_READ_REQ_DUTY_CYCLE (IPC_PUNIT_GTD_CMD_BASE | 0x16) +#define IPC_PUNIT_GTD_DIS_VOL_FREQ_CHG_REQUEST (IPC_PUNIT_GTD_CMD_BASE | 0x17) +#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_CTRL (IPC_PUNIT_GTD_CMD_BASE | 0x1a) +#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_TUNING (IPC_PUNIT_GTD_CMD_BASE | 0x1c) + +/* ISP Driver => Pcode commands */ +#define IPC_PUNIT_ISPD_ZERO (IPC_PUNIT_ISPD_CMD_BASE | 0x00) +#define IPC_PUNIT_ISPD_CONFIG (IPC_PUNIT_ISPD_CMD_BASE | 0x01) +#define IPC_PUNIT_ISPD_GET_ISP_LTR_VAL (IPC_PUNIT_ISPD_CMD_BASE | 0x02) +#define IPC_PUNIT_ISPD_ACCESS_IU_FREQ_BOUNDS (IPC_PUNIT_ISPD_CMD_BASE | 0x03) +#define IPC_PUNIT_ISPD_READ_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x04) +#define IPC_PUNIT_ISPD_WRITE_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x05) + +/* Error codes */ +#define IPC_PUNIT_ERR_SUCCESS 0 +#define IPC_PUNIT_ERR_INVALID_CMD 1 +#define IPC_PUNIT_ERR_INVALID_PARAMETER 2 +#define IPC_PUNIT_ERR_CMD_TIMEOUT 3 +#define IPC_PUNIT_ERR_CMD_LOCKED 4 +#define IPC_PUNIT_ERR_INVALID_VR_ID 5 +#define IPC_PUNIT_ERR_VR_ERR 6 + +#if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC) + +int intel_punit_ipc_simple_command(int cmd, int para1, int para2); +int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out); + +#else + +static inline int intel_punit_ipc_simple_command(int cmd, + int para1, int para2) +{ + return -ENODEV; +} + +static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, + u32 *in, u32 *out) +{ + return -ENODEV; +} + +#endif /* CONFIG_INTEL_PUNIT_IPC */ + +#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h new file mode 100644 index 0000000..ed65fe7 --- /dev/null +++ b/arch/x86/include/asm/intel_telemetry.h @@ -0,0 +1,147 @@ +/* + * Intel SOC Telemetry Driver Header File + * Copyright (C) 2015, Intel Corporation. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#ifndef INTEL_TELEMETRY_H +#define INTEL_TELEMETRY_H + +#define TELEM_MAX_EVENTS_SRAM 28 +#define TELEM_MAX_OS_ALLOCATED_EVENTS 20 + +enum telemetry_unit { + TELEM_PSS = 0, + TELEM_IOSS, + TELEM_UNIT_NONE +}; + +struct telemetry_evtlog { + u32 telem_evtid; + u64 telem_evtlog; +}; + +struct telemetry_evtconfig { + /* Array of Event-IDs to Enable */ + u32 *evtmap; + + /* Number of Events (<29) in evtmap */ + u8 num_evts; + + /* Sampling period */ + u8 period; +}; + +struct telemetry_evtmap { + const char *name; + u32 evt_id; +}; + +struct telemetry_unit_config { + struct telemetry_evtmap *telem_evts; + void __iomem *regmap; + u32 ssram_base_addr; + u8 ssram_evts_used; + u8 curr_period; + u8 max_period; + u8 min_period; + u32 ssram_size; + +}; + +struct telemetry_plt_config { + struct telemetry_unit_config pss_config; + struct telemetry_unit_config ioss_config; + struct mutex telem_trace_lock; + struct mutex telem_lock; + bool telem_in_use; +}; + +struct telemetry_core_ops { + int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period, + u8 *ioss_min_period, u8 *ioss_max_period); + + int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig, + struct telemetry_evtconfig *ioss_evtconfig, + int pss_len, int ioss_len); + + int (*update_events)(struct telemetry_evtconfig pss_evtconfig, + struct telemetry_evtconfig ioss_evtconfig); + + int (*set_sampling_period)(u8 pss_period, u8 ioss_period); + + int (*get_trace_verbosity)(enum telemetry_unit telem_unit, + u32 *verbosity); + + int (*set_trace_verbosity)(enum telemetry_unit telem_unit, + u32 verbosity); + + int (*raw_read_eventlog)(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, + int len, int log_all_evts); + + int (*read_eventlog)(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, + int len, int log_all_evts); + + int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts, + u32 *pss_evtmap, u32 *ioss_evtmap); + + int (*reset_events)(void); +}; + +int telemetry_set_pltdata(struct telemetry_core_ops *ops, + struct telemetry_plt_config *pltconfig); + +int telemetry_clear_pltdata(void); + +int telemetry_pltconfig_valid(void); + +int telemetry_get_evtname(enum telemetry_unit telem_unit, + const char **name, int len); + +int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig, + struct telemetry_evtconfig ioss_evtconfig); + +int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts, + u32 *pss_evtmap, u32 *ioss_evtmap); + +int telemetry_reset_events(void); + +int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config, + struct telemetry_evtconfig *ioss_config, + int pss_len, int ioss_len); + +int telemetry_read_events(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_raw_read_events(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_read_eventlog(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit, + struct telemetry_evtlog *evtlog, int len); + +int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period, + u8 *ioss_min_period, u8 *ioss_max_period); + +int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period); + +int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit, + u32 verbosity); + +int telemetry_get_trace_verbosity(enum telemetry_unit telem_unit, + u32 *verbosity); + +#endif /* INTEL_TELEMETRY_H */ diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index b72ad0f..b41ee16 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -1,5 +1,5 @@ /* - * iosf_mbi.h: Intel OnChip System Fabric MailBox access support + * Intel OnChip System Fabric MailBox access support */ #ifndef IOSF_MBI_SYMS_H @@ -16,6 +16,18 @@ #define MBI_MASK_LO 0x000000FF #define MBI_ENABLE 0xF0 +/* IOSF SB read/write opcodes */ +#define MBI_MMIO_READ 0x00 +#define MBI_MMIO_WRITE 0x01 +#define MBI_CFG_READ 0x04 +#define MBI_CFG_WRITE 0x05 +#define MBI_CR_READ 0x06 +#define MBI_CR_WRITE 0x07 +#define MBI_REG_READ 0x10 +#define MBI_REG_WRITE 0x11 +#define MBI_ESRAM_READ 0x12 +#define MBI_ESRAM_WRITE 0x13 + /* Baytrail available units */ #define BT_MBI_UNIT_AUNIT 0x00 #define BT_MBI_UNIT_SMC 0x01 @@ -28,50 +40,13 @@ #define BT_MBI_UNIT_SATA 0xA3 #define BT_MBI_UNIT_PCIE 0xA6 -/* Baytrail read/write opcodes */ -#define BT_MBI_AUNIT_READ 0x10 -#define BT_MBI_AUNIT_WRITE 0x11 -#define BT_MBI_SMC_READ 0x10 -#define BT_MBI_SMC_WRITE 0x11 -#define BT_MBI_CPU_READ 0x10 -#define BT_MBI_CPU_WRITE 0x11 -#define BT_MBI_BUNIT_READ 0x10 -#define BT_MBI_BUNIT_WRITE 0x11 -#define BT_MBI_PMC_READ 0x06 -#define BT_MBI_PMC_WRITE 0x07 -#define BT_MBI_GFX_READ 0x00 -#define BT_MBI_GFX_WRITE 0x01 -#define BT_MBI_SMIO_READ 0x06 -#define BT_MBI_SMIO_WRITE 0x07 -#define BT_MBI_USB_READ 0x06 -#define BT_MBI_USB_WRITE 0x07 -#define BT_MBI_SATA_READ 0x00 -#define BT_MBI_SATA_WRITE 0x01 -#define BT_MBI_PCIE_READ 0x00 -#define BT_MBI_PCIE_WRITE 0x01 - /* Quark available units */ #define QRK_MBI_UNIT_HBA 0x00 #define QRK_MBI_UNIT_HB 0x03 #define QRK_MBI_UNIT_RMU 0x04 #define QRK_MBI_UNIT_MM 0x05 -#define QRK_MBI_UNIT_MMESRAM 0x05 #define QRK_MBI_UNIT_SOC 0x31 -/* Quark read/write opcodes */ -#define QRK_MBI_HBA_READ 0x10 -#define QRK_MBI_HBA_WRITE 0x11 -#define QRK_MBI_HB_READ 0x10 -#define QRK_MBI_HB_WRITE 0x11 -#define QRK_MBI_RMU_READ 0x10 -#define QRK_MBI_RMU_WRITE 0x11 -#define QRK_MBI_MM_READ 0x10 -#define QRK_MBI_MM_WRITE 0x11 -#define QRK_MBI_MMESRAM_READ 0x12 -#define QRK_MBI_MMESRAM_WRITE 0x13 -#define QRK_MBI_SOC_READ 0x06 -#define QRK_MBI_SOC_WRITE 0x07 - #if IS_ENABLED(CONFIG_IOSF_MBI) bool iosf_mbi_available(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 30cfd64..44adbb8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include <linux/pvclock_gtod.h> #include <linux/clocksource.h> #include <linux/irqbypass.h> +#include <linux/hyperv.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -45,6 +46,31 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS +/* x86-specific vcpu->requests bit members */ +#define KVM_REQ_MIGRATE_TIMER 8 +#define KVM_REQ_REPORT_TPR_ACCESS 9 +#define KVM_REQ_TRIPLE_FAULT 10 +#define KVM_REQ_MMU_SYNC 11 +#define KVM_REQ_CLOCK_UPDATE 12 +#define KVM_REQ_DEACTIVATE_FPU 13 +#define KVM_REQ_EVENT 14 +#define KVM_REQ_APF_HALT 15 +#define KVM_REQ_STEAL_UPDATE 16 +#define KVM_REQ_NMI 17 +#define KVM_REQ_PMU 18 +#define KVM_REQ_PMI 19 +#define KVM_REQ_SMI 20 +#define KVM_REQ_MASTERCLOCK_UPDATE 21 +#define KVM_REQ_MCLOCK_INPROGRESS 22 +#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 +#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_HV_CRASH 26 +#define KVM_REQ_IOAPIC_EOI_EXIT 27 +#define KVM_REQ_HV_RESET 28 +#define KVM_REQ_HV_EXIT 29 +#define KVM_REQ_HV_STIMER 30 + #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ @@ -213,6 +239,10 @@ union kvm_mmu_page_role { }; }; +struct kvm_rmap_head { + unsigned long val; +}; + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -230,7 +260,7 @@ struct kvm_mmu_page { bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; - unsigned long parent_ptes; /* Reverse mapping for parent_pte */ + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */ unsigned long mmu_valid_gen; @@ -374,10 +404,38 @@ struct kvm_mtrr { struct list_head head; }; +/* Hyper-V SynIC timer */ +struct kvm_vcpu_hv_stimer { + struct hrtimer timer; + int index; + u64 config; + u64 count; + u64 exp_time; + struct hv_message msg; + bool msg_pending; +}; + +/* Hyper-V synthetic interrupt controller (SynIC)*/ +struct kvm_vcpu_hv_synic { + u64 version; + u64 control; + u64 msg_page; + u64 evt_page; + atomic64_t sint[HV_SYNIC_SINT_COUNT]; + atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT]; + DECLARE_BITMAP(auto_eoi_bitmap, 256); + DECLARE_BITMAP(vec_bitmap, 256); + bool active; +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; s64 runtime_offset; + struct kvm_vcpu_hv_synic synic; + struct kvm_hyperv_exit exit; + struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; + DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); }; struct kvm_vcpu_arch { @@ -400,7 +458,8 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ - u64 eoi_exit_bitmap[4]; + bool apicv_active; + DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -589,7 +648,7 @@ struct kvm_lpage_info { }; struct kvm_arch_memory_slot { - unsigned long *rmap[KVM_NR_PAGE_SIZES]; + struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; @@ -831,10 +890,11 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); + bool (*get_enable_apicv)(void); + void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); @@ -1086,6 +1146,8 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); +void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, @@ -1231,6 +1293,9 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); +void kvm_make_mclock_inprogress_request(struct kvm *kvm); +void kvm_make_scan_ioapic_request(struct kvm *kvm); + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 3bbc07a..73d0c9b 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -12,7 +12,9 @@ #define GUEST_PL 1 /* Page for Switcher text itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids) +#define SWITCHER_TEXT_PAGES (1) +#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) +#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) /* Where we map the Switcher, in both Host and Guest. */ extern unsigned long switcher_addr; diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 1edc9cd..bfd9b2a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -132,14 +132,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * be sent, and CPU 0's TLB will contain a stale entry.) * * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs much + * reordered before that CPU's store, so both CPUs must * execute full barriers to prevent this from happening. * * Thus, switch_mm needs a full barrier between the * store to mm_cpumask and any operation that could load - * from next->pgd. This barrier synchronizes with - * remote TLB flushers. Fortunately, load_cr3 is - * serializing and thus acts as a full barrier. + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. * */ load_cr3(next->pgd); @@ -188,9 +190,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. * - * As above, this is a barrier that forces - * TLB repopulation to be ordered after the - * store to mm_cpumask. + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195d..46873fb 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -151,11 +151,11 @@ extern struct list_head pci_mmcfg_list; #define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20) /* - * AMD Fam10h CPUs are buggy, and cannot access MMIO config space - * on their northbrige except through the * %eax register. As such, you MUST - * NOT use normal IOMEM accesses, you need to only use the magic mmio-config - * accessor functions. - * In fact just use pci_config_*, nothing else please. + * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use + * %eax. No other source or target registers may be used. The following + * mmio_config_* accessors enforce this. See "BIOS and Kernel Developer's + * Guide (BKDG) For AMD Family 10h Processors", rev. 3.48, sec 2.11.1, + * "MMIO Configuration Coding Requirements". */ static inline unsigned char mmio_config_readb(void __iomem *pos) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d3eee66..0687c47 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -162,20 +162,22 @@ static inline int pmd_large(pmd_t pte) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - static inline int pmd_trans_huge(pmd_t pmd) { - return pmd_val(pmd) & _PAGE_PSE; + return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } static inline int has_transparent_hugepage(void) { return cpu_has_pse; } + +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline int pmd_devmap(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_DEVMAP); +} +#endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) @@ -252,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pte_t pte_mkdevmap(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); +} + static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) { pmdval_t v = native_pmd_val(pmd); @@ -271,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_ACCESSED); } +static inline pmd_t pmd_mkclean(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_DIRTY); +} + static inline pmd_t pmd_wrprotect(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_RW); @@ -281,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DEVMAP); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); @@ -462,6 +479,13 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t a) +{ + return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; +} +#endif + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { @@ -808,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - #define __HAVE_ARCH_PMD_WRITE static inline int pmd_write(pmd_t pmd) { diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a471cad..04c27a0 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -22,10 +22,11 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ @@ -46,7 +47,6 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) -#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK @@ -85,8 +85,11 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) +#define __HAVE_ARCH_PTE_DEVMAP #else #define _PAGE_NX (_AT(pteval_t, 0)) +#define _PAGE_DEVMAP (_AT(pteval_t, 0)) #endif #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 7249e6d..5973a2f 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -55,6 +55,7 @@ enum sst_audio_device_id_mrfld { PIPE_MEDIA0_IN = 0x8F, PIPE_MEDIA1_IN = 0x90, PIPE_MEDIA2_IN = 0x91, + PIPE_MEDIA3_IN = 0x9C, PIPE_RSVD = 0xFF, }; diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index d8ce3ec..c57fd1e 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void) } /** - * __arch_wb_cache_pmem - write back a cache range with CLWB + * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + * arch_wmb_pmem() call. */ -static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; + void *vaddr = (void __force *)addr; void *vend = vaddr + size; void *p; @@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, len = copy_from_iter_nocache(vaddr, bytes, i); if (__iter_needs_pmem_wb(i)) - __arch_wb_cache_pmem(vaddr, bytes); + arch_wb_cache_pmem(addr, bytes); return len; } @@ -132,13 +133,8 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) { void *vaddr = (void __force *)addr; - /* TODO: implement the zeroing via non-temporal writes */ - if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) - clear_page(vaddr); - else - memset(vaddr, 0, size); - - __arch_wb_cache_pmem(vaddr, size); + memset(vaddr, 0, size); + arch_wb_cache_pmem(addr, size); } static inline bool __arch_has_wmb_pmem(void) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 660458a..a4a30e4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -134,6 +134,9 @@ extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_bad(void); +#define __uaccess_begin() stac() +#define __uaccess_end() clac() + /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) #ifdef CONFIG_X86_32 #define __put_user_asm_u64(x, addr, err, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ ".section .fixup,\"ax\"\n" \ "4: movl %3,%0\n" \ " jmp 3b\n" \ @@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "A" (x), "r" (addr), "i" (errret), "0" (err)) #define __put_user_asm_ex_u64(x, addr) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ - "3: " ASM_CLAC "\n" \ + "3:" \ _ASM_EXTABLE_EX(1b, 2b) \ _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) @@ -304,6 +307,10 @@ do { \ } \ } while (0) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __put_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -358,9 +365,9 @@ do { \ } while (0) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %2,%"rtype"1\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ @@ -370,6 +377,10 @@ do { \ : "=r" (err), ltype(x) \ : "m" (__m(addr)), "i" (errret), "0" (err)) +/* + * This doesn't do __uaccess_begin/end - the exception handling + * around it must do that. + */ #define __get_user_size_ex(x, ptr, size) \ do { \ __chk_user_ptr(ptr); \ @@ -400,7 +411,9 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ int __pu_err; \ + __uaccess_begin(); \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ + __uaccess_end(); \ __builtin_expect(__pu_err, 0); \ }) @@ -408,7 +421,9 @@ do { \ ({ \ int __gu_err; \ unsigned long __gu_val; \ + __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ }) @@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: mov"itype" %"rtype"1,%2\n" \ - "2: " ASM_CLAC "\n" \ + "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ @@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; }; */ #define uaccess_try do { \ current_thread_info()->uaccess_err = 0; \ - stac(); \ + __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ - clac(); \ + __uaccess_end(); \ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ } while (0) @@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ + __uaccess_begin(); \ switch (size) { \ case 1: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 2: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void) } \ case 4: \ { \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void) if (!IS_ENABLED(CONFIG_X86_64)) \ __cmpxchg_wrong_size(); \ \ - asm volatile("\t" ASM_STAC "\n" \ + asm volatile("\n" \ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \ - "2:\t" ASM_CLAC "\n" \ + "2:\n" \ "\t.section .fixup, \"ax\"\n" \ "3:\tmov %3, %0\n" \ "\tjmp 2b\n" \ @@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ + __uaccess_end(); \ *__uval = __old; \ __ret; \ }) @@ -754,5 +771,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n) */ #define __copy_from_user_nmi __copy_from_user_inatomic +/* + * The "unsafe" user accesses aren't really "unsafe", but the naming + * is a big fat warning: you have to not only do the access_ok() + * checking before using them, but you have to surround them with the + * user_access_begin/end() pair. + */ +#define user_access_begin() __uaccess_begin() +#define user_access_end() __uaccess_end() + +#define unsafe_put_user(x, ptr) \ +({ \ + int __pu_err; \ + __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ + __builtin_expect(__pu_err, 0); \ +}) + +#define unsafe_get_user(x, ptr) \ +({ \ + int __gu_err; \ + unsigned long __gu_val; \ + __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + __builtin_expect(__gu_err, 0); \ +}) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f2f9b39..b89c34c 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { - case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src, + case 1: + __uaccess_begin(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); + __uaccess_end(); return ret; - case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src, + case 2: + __uaccess_begin(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; - case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src, + case 4: + __uaccess_begin(); + __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); + __uaccess_end(); return ret; - case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src, + case 8: + __uaccess_begin(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u16 *)(8 + (char *)dst), - (u16 __user *)(8 + (char __user *)src), - ret, "w", "w", "=r", 2); + if (likely(!ret)) + __get_user_asm(*(u16 *)(8 + (char *)dst), + (u16 __user *)(8 + (char __user *)src), + ret, "w", "w", "=r", 2); + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); - if (unlikely(ret)) - return ret; - __get_user_asm(*(u64 *)(8 + (char *)dst), - (u64 __user *)(8 + (char __user *)src), - ret, "q", "", "=r", 8); + if (likely(!ret)) + __get_user_asm(*(u64 *)(8 + (char *)dst), + (u64 __user *)(8 + (char __user *)src), + ret, "q", "", "=r", 8); + __uaccess_end(); return ret; default: return copy_user_generic(dst, (__force void *)src, size); @@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { - case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst, + case 1: + __uaccess_begin(); + __put_user_asm(*(u8 *)src, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; - case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst, + case 2: + __uaccess_begin(); + __put_user_asm(*(u16 *)src, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; - case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst, + case 4: + __uaccess_begin(); + __put_user_asm(*(u32 *)src, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; - case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst, + case 8: + __uaccess_begin(); + __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; case 10: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 10); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, - ret, "w", "w", "ir", 2); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst, + ret, "w", "w", "ir", 2); + } + __uaccess_end(); return ret; case 16: + __uaccess_begin(); __put_user_asm(*(u64 *)src, (u64 __user *)dst, ret, "q", "", "er", 16); - if (unlikely(ret)) - return ret; - asm("":::"memory"); - __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, - ret, "q", "", "er", 8); + if (likely(!ret)) { + asm("":::"memory"); + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst, + ret, "q", "", "er", 8); + } + __uaccess_end(); return ret; default: return copy_user_generic((__force void *)dst, src, size); @@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) switch (size) { case 1: { u8 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u8 __user *)src, ret, "b", "b", "=q", 1); if (likely(!ret)) __put_user_asm(tmp, (u8 __user *)dst, ret, "b", "b", "iq", 1); + __uaccess_end(); return ret; } case 2: { u16 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u16 __user *)src, ret, "w", "w", "=r", 2); if (likely(!ret)) __put_user_asm(tmp, (u16 __user *)dst, ret, "w", "w", "ir", 2); + __uaccess_end(); return ret; } case 4: { u32 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u32 __user *)src, ret, "l", "k", "=r", 4); if (likely(!ret)) __put_user_asm(tmp, (u32 __user *)dst, ret, "l", "k", "ir", 4); + __uaccess_end(); return ret; } case 8: { u64 tmp; + __uaccess_begin(); __get_user_asm(tmp, (u64 __user *)src, ret, "q", "", "=r", 8); if (likely(!ret)) __put_user_asm(tmp, (u64 __user *)dst, ret, "q", "", "er", 8); + __uaccess_end(); return ret; } default: diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 4c20dd3..3bcdcc8 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -310,10 +310,10 @@ HYPERVISOR_mca(struct xen_mc *mc_op) } static inline int -HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) +HYPERVISOR_platform_op(struct xen_platform_op *op) { - platform_op->interface_version = XENPF_INTERFACE_VERSION; - return _hypercall1(int, dom0_op, platform_op); + op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, op); } static inline int diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 040d408..7956412 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -269,4 +269,96 @@ typedef struct _HV_REFERENCE_TSC_PAGE { #define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) #define HV_SYNIC_SINT_VECTOR_MASK (0xFF) +#define HV_SYNIC_STIMER_COUNT (4) + +/* Define synthetic interrupt controller message constants. */ +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define hypervisor message types. */ +enum hv_message_type { + HVMSG_NONE = 0x00000000, + + /* Memory access messages. */ + HVMSG_UNMAPPED_GPA = 0x80000000, + HVMSG_GPA_INTERCEPT = 0x80000001, + + /* Timer notification messages. */ + HVMSG_TIMER_EXPIRED = 0x80000010, + + /* Error messages. */ + HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, + HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + + /* Trace buffer complete messages. */ + HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, + + /* Platform-specific processor intercept messages. */ + HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, + HVMSG_X64_MSR_INTERCEPT = 0x80010001, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 +}; + +/* Define synthetic interrupt controller message flags. */ +union hv_message_flags { + __u8 asu8; + struct { + __u8 msg_pending:1; + __u8 reserved:7; + }; +}; + +/* Define port identifier type. */ +union hv_port_id { + __u32 asu32; + struct { + __u32 id:24; + __u32 reserved:8; + } u; +}; + +/* Define synthetic interrupt controller message header. */ +struct hv_message_header { + __u32 message_type; + __u8 payload_size; + union hv_message_flags message_flags; + __u8 reserved[2]; + union { + __u64 sender; + union hv_port_id port; + }; +}; + +/* Define synthetic interrupt controller message format. */ +struct hv_message { + struct hv_message_header header; + union { + __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +}; + +/* Define the synthetic interrupt message page layout. */ +struct hv_message_page { + struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; +}; + +/* Define timer message payload structure. */ +struct hv_timer_message_payload { + __u32 timer_index; + __u32 reserved; + __u64 expiration_time; /* When the timer expired */ + __u64 delivery_time; /* When the message was delivered */ +}; + +#define HV_STIMER_ENABLE (1ULL << 0) +#define HV_STIMER_PERIODIC (1ULL << 1) +#define HV_STIMER_LAZY (1ULL << 2) +#define HV_STIMER_AUTOENABLE (1ULL << 3) +#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) + #endif diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e678dde..a07956a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) */ int ht_nodeid = c->initial_apicid; - if (ht_nodeid >= 0 && - __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index bd3507d..2836de3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -58,28 +58,6 @@ static void cpuid_smp_cpuid(void *cmd_block) &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); } -static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) -{ - loff_t ret; - struct inode *inode = file->f_mapping->host; - - mutex_lock(&inode->i_mutex); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - break; - default: - ret = -EINVAL; - } - mutex_unlock(&inode->i_mutex); - return ret; -} - static ssize_t cpuid_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -132,7 +110,7 @@ static int cpuid_open(struct inode *inode, struct file *file) */ static const struct file_operations cpuid_fops = { .owner = THIS_MODULE, - .llseek = cpuid_seek, + .llseek = no_seek_end_llseek, .read = cpuid_read, .open = cpuid_open, }; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index db9a675..bca14c8 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -547,6 +547,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_CHV_IDS(&chv_stolen_funcs), INTEL_SKL_IDS(&gen9_stolen_funcs), INTEL_BXT_IDS(&gen9_stolen_funcs), + INTEL_KBL_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 311bcf3..29408d6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -105,14 +105,14 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, { unsigned char replaced[MCOUNT_INSN_SIZE]; + ftrace_expected = old_code; + /* - * Note: Due to modules and __init, code can - * disappear and change, we need to protect against faulting - * as well as code changing. We do this by using the - * probe_kernel_* functions. - * - * No real locking needed, this code is run through - * kstop_machine, or before SMP starts. + * Note: + * We are paranoid about modifying text, as if a bug was to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with probe_kernel_*(), and make + * sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ @@ -154,6 +154,8 @@ int ftrace_make_nop(struct module *mod, if (addr == MCOUNT_ADDR) return ftrace_modify_code_direct(rec->ip, old, new); + ftrace_expected = NULL; + /* Normal cases use add_brk_on_nop */ WARN_ONCE(1, "invalid use of ftrace_make_nop"); return -EINVAL; @@ -220,6 +222,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { WARN_ON(1); + ftrace_expected = NULL; return -EINVAL; } @@ -314,6 +317,8 @@ static int add_break(unsigned long ip, const char *old) if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) return -EFAULT; + ftrace_expected = old; + /* Make sure it is what we expect it to be */ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) return -EINVAL; @@ -413,6 +418,8 @@ static int remove_breakpoint(struct dyn_ftrace *rec) ftrace_addr = ftrace_get_addr_curr(rec); nop = ftrace_call_replace(ip, ftrace_addr); + ftrace_expected = nop; + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) return -EINVAL; } diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c index d1d35cc..92fc1a5 100644 --- a/arch/x86/kernel/livepatch.c +++ b/arch/x86/kernel/livepatch.c @@ -20,8 +20,6 @@ #include <linux/module.h> #include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/page_types.h> #include <asm/elf.h> #include <asm/livepatch.h> @@ -38,11 +36,10 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value) { - int ret, numpages, size = 4; - bool readonly; + size_t size = 4; unsigned long val; - unsigned long core = (unsigned long)mod->module_core; - unsigned long core_size = mod->core_size; + unsigned long core = (unsigned long)mod->core_layout.base; + unsigned long core_size = mod->core_layout.size; switch (type) { case R_X86_64_NONE: @@ -69,23 +66,5 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, /* loc does not point to any symbol inside the module */ return -EINVAL; - readonly = false; - -#ifdef CONFIG_DEBUG_SET_MODULE_RONX - if (loc < core + mod->core_ro_size) - readonly = true; -#endif - - /* determine if the relocation spans a page boundary */ - numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; - - if (readonly) - set_memory_rw(loc & PAGE_MASK, numpages); - - ret = probe_kernel_write((void *)loc, &val, size); - - if (readonly) - set_memory_ro(loc & PAGE_MASK, numpages); - - return ret; + return probe_kernel_write((void *)loc, &val, size); } diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 819ab3f..ba7fbba 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +#ifdef CONFIG_KEXEC_VERIFY_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, unsigned long kernel_len) { @@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, return image->fops->verify_sig(kernel, kernel_len); } +#endif /* * Apply purgatory relocations. diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 113e707..64f9616 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -45,28 +45,6 @@ static struct class *msr_class; -static loff_t msr_seek(struct file *file, loff_t offset, int orig) -{ - loff_t ret; - struct inode *inode = file_inode(file); - - mutex_lock(&inode->i_mutex); - switch (orig) { - case SEEK_SET: - file->f_pos = offset; - ret = file->f_pos; - break; - case SEEK_CUR: - file->f_pos += offset; - ret = file->f_pos; - break; - default: - ret = -EINVAL; - } - mutex_unlock(&inode->i_mutex); - return ret; -} - static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -194,7 +172,7 @@ static int msr_open(struct inode *inode, struct file *file) */ static const struct file_operations msr_fops = { .owner = THIS_MODULE, - .llseek = msr_seek, + .llseek = no_seek_end_llseek, .read = msr_read, .write = msr_write, .open = msr_open, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d64889a..ab0adc0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the iMac10,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, /* ASRock */ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 483231e..e574b85 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); - split_huge_page_pmd_mm(mm, 0xA0000, pmd); + + if (pmd_trans_huge(*pmd)) { + struct vm_area_struct *vma = find_vma(mm, 0xA0000); + split_huge_pmd(vma, pmd, 0xA0000); + } if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 62cf8c9..c58ba67 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,13 +23,665 @@ #include "x86.h" #include "lapic.h" +#include "ioapic.h" #include "hyperv.h" #include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <asm/apicdef.h> #include <trace/events/kvm.h> #include "trace.h" +static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) +{ + return atomic64_read(&synic->sint[sint]); +} + +static inline int synic_get_sint_vector(u64 sint_value) +{ + if (sint_value & HV_SYNIC_SINT_MASKED) + return -1; + return sint_value & HV_SYNIC_SINT_VECTOR_MASK; +} + +static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, + int vector) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { + if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) + return true; + } + return false; +} + +static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, + int vector) +{ + int i; + u64 sint_value; + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { + sint_value = synic_read_sint(synic, i); + if (synic_get_sint_vector(sint_value) == vector && + sint_value & HV_SYNIC_SINT_AUTO_EOI) + return true; + } + return false; +} + +static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, + u64 data, bool host) +{ + int vector; + + vector = data & HV_SYNIC_SINT_VECTOR_MASK; + if (vector < 16 && !host) + return 1; + /* + * Guest may configure multiple SINTs to use the same vector, so + * we maintain a bitmap of vectors handled by synic, and a + * bitmap of vectors with auto-eoi behavior. The bitmaps are + * updated here, and atomically queried on fast paths. + */ + + atomic64_set(&synic->sint[sint], data); + + if (synic_has_vector_connected(synic, vector)) + __set_bit(vector, synic->vec_bitmap); + else + __clear_bit(vector, synic->vec_bitmap); + + if (synic_has_vector_auto_eoi(synic, vector)) + __set_bit(vector, synic->auto_eoi_bitmap); + else + __clear_bit(vector, synic->auto_eoi_bitmap); + + /* Load SynIC vectors into EOI exit bitmap */ + kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); + return 0; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_hv_synic *synic; + + if (vcpu_id >= atomic_read(&kvm->online_vcpus)) + return NULL; + vcpu = kvm_get_vcpu(kvm, vcpu_id); + if (!vcpu) + return NULL; + synic = vcpu_to_synic(vcpu); + return (synic->active) ? synic : NULL; +} + +static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, + u32 sint) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct page *page; + gpa_t gpa; + struct hv_message *msg; + struct hv_message_page *msg_page; + + gpa = synic->msg_page & PAGE_MASK; + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", + gpa); + return; + } + msg_page = kmap_atomic(page); + + msg = &msg_page->sint_message[sint]; + msg->header.message_flags.msg_pending = 0; + + kunmap_atomic(msg_page); + kvm_release_page_dirty(page); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); +} + +static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_stimer *stimer; + int gsi, idx, stimers_pending; + + trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); + + if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) + synic_clear_sint_msg_pending(synic, sint); + + /* Try to deliver pending Hyper-V SynIC timers messages */ + stimers_pending = 0; + for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { + stimer = &hv_vcpu->stimer[idx]; + if (stimer->msg_pending && + (stimer->config & HV_STIMER_ENABLE) && + HV_STIMER_SINT(stimer->config) == sint) { + set_bit(stimer->index, + hv_vcpu->stimer_pending_bitmap); + stimers_pending++; + } + } + if (stimers_pending) + kvm_make_request(KVM_REQ_HV_STIMER, vcpu); + + idx = srcu_read_lock(&kvm->irq_srcu); + gsi = atomic_read(&synic->sint_to_gsi[sint]); + if (gsi != -1) + kvm_notify_acked_gsi(kvm, gsi); + srcu_read_unlock(&kvm->irq_srcu, idx); +} + +static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; + hv_vcpu->exit.u.synic.msr = msr; + hv_vcpu->exit.u.synic.control = synic->control; + hv_vcpu->exit.u.synic.evt_page = synic->evt_page; + hv_vcpu->exit.u.synic.msg_page = synic->msg_page; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, + u32 msr, u64 data, bool host) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + int ret; + + if (!synic->active) + return 1; + + trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); + + ret = 0; + switch (msr) { + case HV_X64_MSR_SCONTROL: + synic->control = data; + if (!host) + synic_exit(synic, msr); + break; + case HV_X64_MSR_SVERSION: + if (!host) { + ret = 1; + break; + } + synic->version = data; + break; + case HV_X64_MSR_SIEFP: + if (data & HV_SYNIC_SIEFP_ENABLE) + if (kvm_clear_guest(vcpu->kvm, + data & PAGE_MASK, PAGE_SIZE)) { + ret = 1; + break; + } + synic->evt_page = data; + if (!host) + synic_exit(synic, msr); + break; + case HV_X64_MSR_SIMP: + if (data & HV_SYNIC_SIMP_ENABLE) + if (kvm_clear_guest(vcpu->kvm, + data & PAGE_MASK, PAGE_SIZE)) { + ret = 1; + break; + } + synic->msg_page = data; + if (!host) + synic_exit(synic, msr); + break; + case HV_X64_MSR_EOM: { + int i; + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) + kvm_hv_notify_acked_sint(vcpu, i); + break; + } + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); + break; + default: + ret = 1; + break; + } + return ret; +} + +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +{ + int ret; + + if (!synic->active) + return 1; + + ret = 0; + switch (msr) { + case HV_X64_MSR_SCONTROL: + *pdata = synic->control; + break; + case HV_X64_MSR_SVERSION: + *pdata = synic->version; + break; + case HV_X64_MSR_SIEFP: + *pdata = synic->evt_page; + break; + case HV_X64_MSR_SIMP: + *pdata = synic->msg_page; + break; + case HV_X64_MSR_EOM: + *pdata = 0; + break; + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); + break; + default: + ret = 1; + break; + } + return ret; +} + +int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct kvm_lapic_irq irq; + int ret, vector; + + if (sint >= ARRAY_SIZE(synic->sint)) + return -EINVAL; + + vector = synic_get_sint_vector(synic_read_sint(synic, sint)); + if (vector < 0) + return -ENOENT; + + memset(&irq, 0, sizeof(irq)); + irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.dest_mode = APIC_DEST_PHYSICAL; + irq.delivery_mode = APIC_DM_FIXED; + irq.vector = vector; + irq.level = 1; + + ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); + return ret; +} + +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +{ + struct kvm_vcpu_hv_synic *synic; + + synic = synic_get(kvm, vcpu_id); + if (!synic) + return -EINVAL; + + return synic_set_irq(synic, sint); +} + +void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + int i; + + trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); + + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) + if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) + kvm_hv_notify_acked_sint(vcpu, i); +} + +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +{ + struct kvm_vcpu_hv_synic *synic; + + synic = synic_get(kvm, vcpu_id); + if (!synic) + return -EINVAL; + + if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) + return -EINVAL; + + atomic_set(&synic->sint_to_gsi[sint], gsi); + return 0; +} + +void kvm_hv_irq_routing_update(struct kvm *kvm) +{ + struct kvm_irq_routing_table *irq_rt; + struct kvm_kernel_irq_routing_entry *e; + u32 gsi; + + irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, + lockdep_is_held(&kvm->irq_lock)); + + for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { + hlist_for_each_entry(e, &irq_rt->map[gsi], link) { + if (e->type == KVM_IRQ_ROUTING_HV_SINT) + kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, + e->hv_sint.sint, gsi); + } + } +} + +static void synic_init(struct kvm_vcpu_hv_synic *synic) +{ + int i; + + memset(synic, 0, sizeof(*synic)); + synic->version = HV_SYNIC_VERSION_1; + for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { + atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); + atomic_set(&synic->sint_to_gsi[i], -1); + } +} + +static u64 get_time_ref_counter(struct kvm *kvm) +{ + return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); +} + +static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, + bool vcpu_kick) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + + set_bit(stimer->index, + vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + kvm_make_request(KVM_REQ_HV_STIMER, vcpu); + if (vcpu_kick) + kvm_vcpu_kick(vcpu); +} + +static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + + trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index); + + hrtimer_cancel(&stimer->timer); + clear_bit(stimer->index, + vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap); + stimer->msg_pending = false; + stimer->exp_time = 0; +} + +static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) +{ + struct kvm_vcpu_hv_stimer *stimer; + + stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); + trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index); + stimer_mark_pending(stimer, true); + + return HRTIMER_NORESTART; +} + +/* + * stimer_start() assumptions: + * a) stimer->count is not equal to 0 + * b) stimer->config has HV_STIMER_ENABLE flag + */ +static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) +{ + u64 time_now; + ktime_t ktime_now; + + time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); + ktime_now = ktime_get(); + + if (stimer->config & HV_STIMER_PERIODIC) { + if (stimer->exp_time) { + if (time_now >= stimer->exp_time) { + u64 remainder; + + div64_u64_rem(time_now - stimer->exp_time, + stimer->count, &remainder); + stimer->exp_time = + time_now + (stimer->count - remainder); + } + } else + stimer->exp_time = time_now + stimer->count; + + trace_kvm_hv_stimer_start_periodic( + stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, + time_now, stimer->exp_time); + + hrtimer_start(&stimer->timer, + ktime_add_ns(ktime_now, + 100 * (stimer->exp_time - time_now)), + HRTIMER_MODE_ABS); + return 0; + } + stimer->exp_time = stimer->count; + if (time_now >= stimer->count) { + /* + * Expire timer according to Hypervisor Top-Level Functional + * specification v4(15.3.1): + * "If a one shot is enabled and the specified count is in + * the past, it will expire immediately." + */ + stimer_mark_pending(stimer, false); + return 0; + } + + trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, + time_now, stimer->count); + + hrtimer_start(&stimer->timer, + ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), + HRTIMER_MODE_ABS); + return 0; +} + +static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, + bool host) +{ + trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, config, host); + + stimer_cleanup(stimer); + if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) + config &= ~HV_STIMER_ENABLE; + stimer->config = config; + stimer_mark_pending(stimer, false); + return 0; +} + +static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, + bool host) +{ + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, count, host); + + stimer_cleanup(stimer); + stimer->count = count; + if (stimer->count == 0) + stimer->config &= ~HV_STIMER_ENABLE; + else if (stimer->config & HV_STIMER_AUTOENABLE) + stimer->config |= HV_STIMER_ENABLE; + stimer_mark_pending(stimer, false); + return 0; +} + +static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) +{ + *pconfig = stimer->config; + return 0; +} + +static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) +{ + *pcount = stimer->count; + return 0; +} + +static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, + struct hv_message *src_msg) +{ + struct kvm_vcpu *vcpu = synic_to_vcpu(synic); + struct page *page; + gpa_t gpa; + struct hv_message *dst_msg; + int r; + struct hv_message_page *msg_page; + + if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) + return -ENOENT; + + gpa = synic->msg_page & PAGE_MASK; + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); + if (is_error_page(page)) + return -EFAULT; + + msg_page = kmap_atomic(page); + dst_msg = &msg_page->sint_message[sint]; + if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, + src_msg->header.message_type) != HVMSG_NONE) { + dst_msg->header.message_flags.msg_pending = 1; + r = -EAGAIN; + } else { + memcpy(&dst_msg->u.payload, &src_msg->u.payload, + src_msg->header.payload_size); + dst_msg->header.message_type = src_msg->header.message_type; + dst_msg->header.payload_size = src_msg->header.payload_size; + r = synic_set_irq(synic, sint); + if (r >= 1) + r = 0; + else if (r == 0) + r = -EFAULT; + } + kunmap_atomic(msg_page); + kvm_release_page_dirty(page); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); + return r; +} + +static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct hv_message *msg = &stimer->msg; + struct hv_timer_message_payload *payload = + (struct hv_timer_message_payload *)&msg->u.payload; + + payload->expiration_time = stimer->exp_time; + payload->delivery_time = get_time_ref_counter(vcpu->kvm); + return synic_deliver_msg(vcpu_to_synic(vcpu), + HV_STIMER_SINT(stimer->config), msg); +} + +static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) +{ + int r; + + stimer->msg_pending = true; + r = stimer_send_msg(stimer); + trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, + stimer->index, r); + if (!r) { + stimer->msg_pending = false; + if (!(stimer->config & HV_STIMER_PERIODIC)) + stimer->config &= ~HV_STIMER_ENABLE; + } +} + +void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + struct kvm_vcpu_hv_stimer *stimer; + u64 time_now, exp_time; + int i; + + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) + if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { + stimer = &hv_vcpu->stimer[i]; + if (stimer->config & HV_STIMER_ENABLE) { + exp_time = stimer->exp_time; + + if (exp_time) { + time_now = + get_time_ref_counter(vcpu->kvm); + if (time_now >= exp_time) + stimer_expiration(stimer); + } + + if ((stimer->config & HV_STIMER_ENABLE) && + stimer->count) + stimer_start(stimer); + else + stimer_cleanup(stimer); + } + } +} + +void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + int i; + + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) + stimer_cleanup(&hv_vcpu->stimer[i]); +} + +static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) +{ + struct hv_message *msg = &stimer->msg; + struct hv_timer_message_payload *payload = + (struct hv_timer_message_payload *)&msg->u.payload; + + memset(&msg->header, 0, sizeof(msg->header)); + msg->header.message_type = HVMSG_TIMER_EXPIRED; + msg->header.payload_size = sizeof(*payload); + + payload->timer_index = stimer->index; + payload->expiration_time = 0; + payload->delivery_time = 0; +} + +static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) +{ + memset(stimer, 0, sizeof(*stimer)); + stimer->index = timer_index; + hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + stimer->timer.function = stimer_timer_callback; + stimer_prepare_msg(stimer); +} + +void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + int i; + + synic_init(&hv_vcpu->synic); + + bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); + for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) + stimer_init(&hv_vcpu->stimer[i], i); +} + +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +{ + /* + * Hyper-V SynIC auto EOI SINT's are + * not compatible with APICV, so deactivate APICV + */ + kvm_vcpu_deactivate_apicv(vcpu); + vcpu_to_synic(vcpu)->active = true; + return 0; +} + static bool kvm_hv_msr_partition_wide(u32 msr) { bool r = false; @@ -226,6 +878,31 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv->runtime_offset = data - current_task_runtime_100ns(); break; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host); + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: { + int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; + + return stimer_set_config(vcpu_to_stimer(vcpu, timer_index), + data, host); + } + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: { + int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; + + return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), + data, host); + } default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -248,11 +925,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_HYPERCALL: data = hv->hv_hypercall; break; - case HV_X64_MSR_TIME_REF_COUNT: { - data = - div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + case HV_X64_MSR_TIME_REF_COUNT: + data = get_time_ref_counter(kvm); break; - } case HV_X64_MSR_REFERENCE_TSC: data = hv->hv_tsc_page; break; @@ -304,6 +979,31 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_VP_RUNTIME: data = current_task_runtime_100ns() + hv->runtime_offset; break; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: { + int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; + + return stimer_get_config(vcpu_to_stimer(vcpu, timer_index), + pdata); + } + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: { + int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; + + return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), + pdata); + } default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index c7bce55..60eccd4 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -24,9 +24,64 @@ #ifndef __ARCH_X86_KVM_HYPERV_H__ #define __ARCH_X86_KVM_HYPERV_H__ +static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) +{ + return &vcpu->arch.hyperv; +} + +static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu) +{ + struct kvm_vcpu_arch *arch; + + arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv); + return container_of(arch, struct kvm_vcpu, arch); +} + +static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu) +{ + return &vcpu->arch.hyperv.synic; +} + +static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) +{ + return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); + bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); +void kvm_hv_irq_routing_update(struct kvm *kvm); +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); +void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); + +void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); + +static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, + int timer_index) +{ + return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index]; +} + +static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu_hv *hv_vcpu; + + hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv, + stimer[0]); + return hv_vcpu_to_vcpu(hv_vcpu); +} + +static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) +{ + return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap, + HV_SYNIC_STIMER_COUNT); +} + +void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 88d0a92..1facfd6 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,7 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -250,7 +250,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) (e->fields.trig_mode == IOAPIC_EDGE_TRIG && kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, - (unsigned long *)eoi_exit_bitmap); + ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 084617d..2d16dc2 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -121,7 +121,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); -void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); - +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 5c520eb..a22a488 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); -static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, +static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, unsigned long npages) { gfn_t end_gfn; - pfn_t pfn; + kvm_pfn_t pfn; pfn = gfn_to_pfn_memslot(slot, gfn); end_gfn = gfn + npages; @@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, return pfn; } -static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) +static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, + unsigned long npages) { unsigned long i; @@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { gfn_t gfn, end_gfn; - pfn_t pfn; + kvm_pfn_t pfn; int r = 0; struct iommu_domain *domain = kvm->arch.iommu_domain; int flags; @@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm, { struct iommu_domain *domain; gfn_t end_gfn, gfn; - pfn_t pfn; + kvm_pfn_t pfn; u64 phys; domain = kvm->arch.iommu_domain; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 097060e..3982b47 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -76,7 +76,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_vcpu_apic_vid_enabled(v)) + if (kvm_vcpu_apicv_active(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 84b96d3..8fc89ef 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -33,6 +33,8 @@ #include "lapic.h" +#include "hyperv.h" + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -219,6 +221,16 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (!level) + return -1; + + return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); +} + int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { @@ -257,6 +269,11 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; break; + case KVM_IRQ_ROUTING_HV_SINT: + e->set = kvm_hv_set_sint; + e->hv_sint.vcpu = ue->u.hv_sint.vcpu; + e->hv_sint.sint = ue->u.hv_sint.sint; + break; default: goto out; } @@ -332,14 +349,15 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, empty_routing, 0, 0); } -void kvm_arch_irq_routing_update(struct kvm *kvm) +void kvm_arch_post_irq_routing_update(struct kvm *kvm) { if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) return; kvm_make_scan_ioapic_request(kvm); } -void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, + ulong *ioapic_handled_vectors) { struct kvm *kvm = vcpu->kvm; struct kvm_kernel_irq_routing_entry *entry; @@ -369,9 +387,26 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) u32 vector = entry->msi.data & 0xff; __set_bit(vector, - (unsigned long *) eoi_exit_bitmap); + ioapic_handled_vectors); } } } srcu_read_unlock(&kvm->irq_srcu, idx); } + +int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + switch (irq->type) { + case KVM_IRQ_ROUTING_HV_SINT: + return kvm_hv_set_sint(irq, kvm, irq_source_id, level, + line_status); + default: + return -EWOULDBLOCK; + } +} + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + kvm_hv_irq_routing_update(kvm); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4d30b86..36591fa 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -41,6 +41,7 @@ #include "trace.h" #include "x86.h" #include "cpuid.h" +#include "hyperv.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -128,11 +129,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline int kvm_apic_id(struct kvm_lapic *apic) -{ - return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; -} - /* The logical map is definitely wrong if we have multiple * modes at the same time. (Physical map is always right.) */ @@ -379,7 +375,8 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; - kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + if (apic->vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -392,7 +389,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { + if (unlikely(vcpu->arch.apicv_active)) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -418,7 +415,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_x86_ops->hwapic_isr_update)) + if (unlikely(vcpu->arch.apicv_active)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -466,7 +463,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_x86_ops->hwapic_isr_update)) + if (unlikely(vcpu->arch.apicv_active)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -852,7 +849,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic_clear_vector(vector, apic->regs + APIC_TMR); } - if (kvm_x86_ops->deliver_posted_interrupt) + if (vcpu->arch.apicv_active) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { apic_set_irr(vector, apic); @@ -932,7 +929,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) { - return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); + return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); } static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) @@ -974,6 +971,9 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); + if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap)) + kvm_hv_synic_send_eoi(apic->vcpu, vector); + kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); return vector; @@ -1225,7 +1225,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (kvm_x86_ops->deliver_posted_interrupt) + if (vcpu->arch.apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1693,8 +1693,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); - apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; + apic->irr_pending = vcpu->arch.apicv_active; + apic->isr_count = vcpu->arch.apicv_active ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1883,6 +1883,12 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); + + if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { + apic_clear_isr(vector, apic); + apic_update_ppr(apic); + } + return vector; } @@ -1906,15 +1912,15 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = kvm_x86_ops->hwapic_isr_update ? + apic->isr_count = vcpu->arch.apicv_active ? 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; - if (kvm_x86_ops->hwapic_irr_update) + if (vcpu->arch.apicv_active) { kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index fde8e35d..41bdb35 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->cpu_uses_apicv(vcpu); + return vcpu->arch.apic && vcpu->arch.apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) @@ -164,6 +164,11 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } +static inline int kvm_apic_id(struct kvm_lapic *apic) +{ + return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e7c2c14..95a955d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte) } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - pfn_t pfn, unsigned access) + kvm_pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { mark_mmio_spte(vcpu, sptep, gfn, access); @@ -311,11 +311,6 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_rmap_spte(u64 pte) -{ - return is_shadow_present_pte(pte); -} - static int is_last_spte(u64 pte, int level) { if (level == PT_PAGE_TABLE_LEVEL) @@ -325,7 +320,7 @@ static int is_last_spte(u64 pte, int level) return 0; } -static pfn_t spte_to_pfn(u64 pte) +static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; } @@ -540,7 +535,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) u64 old_spte = *sptep; bool ret = false; - WARN_ON(!is_rmap_spte(new_spte)); + WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); @@ -587,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ static int mmu_spte_clear_track_bits(u64 *sptep) { - pfn_t pfn; + kvm_pfn_t pfn; u64 old_spte = *sptep; if (!spte_has_volatile_bits(old_spte)) @@ -595,7 +590,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) else old_spte = __update_clear_spte_slow(sptep, 0ull); - if (!is_rmap_spte(old_spte)) + if (!is_shadow_present_pte(old_spte)) return 0; pfn = spte_to_pfn(old_spte); @@ -909,36 +904,35 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, } /* - * Pte mapping structures: + * About rmap_head encoding: * - * If pte_list bit zero is zero, then pte_list point to the spte. - * - * If pte_list bit zero is one, (then pte_list & ~1) points to a struct + * If the bit zero of rmap_head->val is clear, then it points to the only spte + * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. - * - * Returns the number of pte entries before the spte was added or zero if - * the spte was not added. - * + */ + +/* + * Returns the number of pointers in the rmap chain, not counting the new one. */ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, - unsigned long *pte_list) + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i, count = 0; - if (!*pte_list) { + if (!rmap_head->val) { rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); - *pte_list = (unsigned long)spte; - } else if (!(*pte_list & 1)) { + rmap_head->val = (unsigned long)spte; + } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_pte_list_desc(vcpu); - desc->sptes[0] = (u64 *)*pte_list; + desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; - *pte_list = (unsigned long)desc | 1; + rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { desc = desc->more; count += PTE_LIST_EXT; @@ -955,8 +949,9 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, } static void -pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, - int i, struct pte_list_desc *prev_desc) +pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, + struct pte_list_desc *desc, int i, + struct pte_list_desc *prev_desc) { int j; @@ -967,43 +962,43 @@ pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, if (j != 0) return; if (!prev_desc && !desc->more) - *pte_list = (unsigned long)desc->sptes[0]; + rmap_head->val = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; else - *pte_list = (unsigned long)desc->more | 1; + rmap_head->val = (unsigned long)desc->more | 1; mmu_free_pte_list_desc(desc); } -static void pte_list_remove(u64 *spte, unsigned long *pte_list) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; int i; - if (!*pte_list) { + if (!rmap_head->val) { printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); BUG(); - } else if (!(*pte_list & 1)) { + } else if (!(rmap_head->val & 1)) { rmap_printk("pte_list_remove: %p 1->0\n", spte); - if ((u64 *)*pte_list != spte) { + if ((u64 *)rmap_head->val != spte) { printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); BUG(); } - *pte_list = 0; + rmap_head->val = 0; } else { rmap_printk("pte_list_remove: %p many->many\n", spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(pte_list, - desc, i, - prev_desc); + pte_list_desc_remove_entry(rmap_head, + desc, i, prev_desc); return; } + } prev_desc = desc; desc = desc->more; } @@ -1012,28 +1007,8 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list) } } -typedef void (*pte_list_walk_fn) (u64 *spte); -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) -{ - struct pte_list_desc *desc; - int i; - - if (!*pte_list) - return; - - if (!(*pte_list & 1)) - return fn((u64 *)*pte_list); - - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) - fn(desc->sptes[i]); - desc = desc->more; - } -} - -static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { unsigned long idx; @@ -1041,10 +1016,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } -/* - * Take gfn and return the reverse mapping to it. - */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) +static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, + struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; @@ -1065,24 +1038,24 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); - return pte_list_add(vcpu, spte, rmapp); + rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + return pte_list_add(vcpu, spte, rmap_head); } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; gfn_t gfn; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp); - pte_list_remove(spte, rmapp); + rmap_head = gfn_to_rmap(kvm, gfn, sp); + pte_list_remove(spte, rmap_head); } /* @@ -1102,19 +1075,26 @@ struct rmap_iterator { * * Returns sptep if found, NULL otherwise. */ -static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, + struct rmap_iterator *iter) { - if (!rmap) + u64 *sptep; + + if (!rmap_head->val) return NULL; - if (!(rmap & 1)) { + if (!(rmap_head->val & 1)) { iter->desc = NULL; - return (u64 *)rmap; + sptep = (u64 *)rmap_head->val; + goto out; } - iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); iter->pos = 0; - return iter->desc->sptes[iter->pos]; + sptep = iter->desc->sptes[iter->pos]; +out: + BUG_ON(!is_shadow_present_pte(*sptep)); + return sptep; } /* @@ -1124,14 +1104,14 @@ static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) */ static u64 *rmap_get_next(struct rmap_iterator *iter) { + u64 *sptep; + if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { - u64 *sptep; - ++iter->pos; sptep = iter->desc->sptes[iter->pos]; if (sptep) - return sptep; + goto out; } iter->desc = iter->desc->more; @@ -1139,17 +1119,20 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - return iter->desc->sptes[iter->pos]; + sptep = iter->desc->sptes[iter->pos]; + goto out; } } return NULL; +out: + BUG_ON(!is_shadow_present_pte(*sptep)); + return sptep; } -#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ - _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ - _spte_ = rmap_get_next(_iter_)) +#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ + _spte_; _spte_ = rmap_get_next(_iter_)) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1207,14 +1190,15 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) return mmu_spte_update(sptep, spte); } -static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, +static bool __rmap_write_protect(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); return flush; @@ -1231,13 +1215,13 @@ static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); return flush; @@ -1254,13 +1238,13 @@ static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); return flush; @@ -1280,12 +1264,12 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; while (mask) { - rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); - __rmap_write_protect(kvm, rmapp, false); + rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ mask &= mask - 1; @@ -1305,12 +1289,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; while (mask) { - rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); - __rmap_clear_dirty(kvm, rmapp); + rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ mask &= mask - 1; @@ -1342,28 +1326,27 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true); } return write_protected; } -static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) +static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - while ((sptep = rmap_get_first(*rmapp, &iter))) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + while ((sptep = rmap_get_first(rmap_head, &iter))) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); @@ -1373,14 +1356,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) return flush; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { - return kvm_zap_rmapp(kvm, rmapp); + return kvm_zap_rmapp(kvm, rmap_head); } -static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1389,13 +1372,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, int need_flush = 0; u64 new_spte; pte_t *ptep = (pte_t *)data; - pfn_t new_pfn; + kvm_pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); restart: - for_each_rmap_spte(rmapp, &iter, sptep) { + for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1433,11 +1416,11 @@ struct slot_rmap_walk_iterator { /* output fields. */ gfn_t gfn; - unsigned long *rmap; + struct kvm_rmap_head *rmap; int level; /* private field. */ - unsigned long *end_rmap; + struct kvm_rmap_head *end_rmap; }; static void @@ -1496,7 +1479,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, unsigned long end, unsigned long data, int (*handler)(struct kvm *kvm, - unsigned long *rmapp, + struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, @@ -1540,7 +1523,8 @@ static int kvm_handle_hva_range(struct kvm *kvm, static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, + int (*handler)(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data)) @@ -1563,7 +1547,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1573,18 +1557,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } + } trace_kvm_age_page(gfn, level, slot, young); return young; } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) { @@ -1600,11 +1585,12 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (*sptep & shadow_accessed_mask) { young = 1; break; } + } out: return young; } @@ -1613,14 +1599,14 @@ out: static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); + rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); - kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -1720,8 +1706,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte, int direct) +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) { struct kvm_mmu_page *sp; @@ -1737,8 +1722,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, * this feature. See the comments in kvm_zap_obsolete_pages(). */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - sp->parent_ptes = 0; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -1746,7 +1729,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - pte_list_walk(&sp->parent_ptes, mark_unsync); + u64 *sptep; + struct rmap_iterator iter; + + for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { + mark_unsync(sptep); + } } static void mark_unsync(u64 *spte) @@ -1806,6 +1794,13 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, return (pvec->nr == KVM_PAGE_ARRAY_NR); } +static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) +{ + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); +} + static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { @@ -1815,8 +1810,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (!is_shadow_present_pte(ent) || is_large_pte(ent)) - goto clear_child_bitmap; + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { + clear_unsync_child_bit(sp, i); + continue; + } child = page_header(ent & PT64_BASE_ADDR_MASK); @@ -1825,28 +1822,21 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) - goto clear_child_bitmap; - else if (ret > 0) + if (!ret) { + clear_unsync_child_bit(sp, i); + continue; + } else if (ret > 0) { nr_unsync_leaf += ret; - else + } else return ret; } else if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; } else - goto clear_child_bitmap; - - continue; - -clear_child_bitmap: - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); + clear_unsync_child_bit(sp, i); } - return nr_unsync_leaf; } @@ -2009,9 +1999,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; - --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); - __clear_bit(idx, sp->unsync_child_bitmap); + clear_unsync_child_bit(sp, idx); level++; } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); } @@ -2053,14 +2041,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } } -static void init_shadow_page_table(struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = 0ull; -} - static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { sp->write_flooding_count = 0; @@ -2083,8 +2063,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gva_t gaddr, unsigned level, int direct, - unsigned access, - u64 *parent_pte) + unsigned access) { union kvm_mmu_page_role role; unsigned quadrant; @@ -2116,21 +2095,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) break; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { + if (sp->unsync_children) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); return sp; } + ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); - if (!sp) - return sp; + + sp = kvm_mmu_alloc_page(vcpu, direct); + sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, @@ -2144,7 +2120,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - init_shadow_page_table(sp); + clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); return sp; } @@ -2198,7 +2174,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) return __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) { u64 spte; @@ -2206,12 +2183,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask; - - if (accessed) - spte |= shadow_accessed_mask; + shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); + + mmu_page_add_parent_pte(vcpu, sp, sptep); + + if (sp->unsync_children || sp->unsync) + mark_unsync(sptep); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2270,17 +2249,12 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, mmu_page_zap_pte(kvm, sp, sp->spt + i); } -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; - while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) drop_parent_pte(sp, sptep); } @@ -2476,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } -static bool kvm_is_mmio_pfn(pfn_t pfn) +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); @@ -2486,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, - gfn_t gfn, pfn_t pfn, bool speculative, + gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { u64 spte; @@ -2564,18 +2538,18 @@ done: return ret; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int write_fault, int *emulate, - int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool host_writable) +static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; + bool emulate = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); - if (is_rmap_spte(*sptep)) { + if (is_shadow_present_pte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2600,12 +2574,12 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - *emulate = 1; + emulate = true; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (unlikely(is_mmio_spte(*sptep) && emulate)) - *emulate = 1; + if (unlikely(is_mmio_spte(*sptep))) + emulate = true; pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", @@ -2624,9 +2598,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } kvm_release_pfn_clean(pfn); + + return emulate; } -static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, +static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; @@ -2658,9 +2634,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, access, 0, NULL, - sp->role.level, gfn, page_to_pfn(pages[i]), - true, true); + mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); return 0; } @@ -2708,9 +2683,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int map_writable, int level, gfn_t gfn, pfn_t pfn, - bool prefault) +static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, + int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2722,9 +2696,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, - write, &emulate, level, gfn, pfn, - prefault, map_writable); + emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, + write, level, gfn, pfn, prefault, + map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2737,10 +2711,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, base_addr &= PT64_LVL_ADDR_MASK(iterator.level); pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, - 1, ACC_ALL, iterator.sptep); + iterator.level - 1, 1, ACC_ALL); - link_shadow_page(iterator.sptep, sp, true); + link_shadow_page(vcpu, iterator.sptep, sp); } } return emulate; @@ -2759,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * send_sig_info(SIGBUS, &info, tsk); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) +static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) { /* * Do not cache the mmio info caused by writing the readonly gfn @@ -2779,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, pfn_t *pfnp, int *levelp) + gfn_t *gfnp, kvm_pfn_t *pfnp, + int *levelp) { - pfn_t pfn = *pfnp; + kvm_pfn_t pfn = *pfnp; gfn_t gfn = *gfnp; int level = *levelp; @@ -2820,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - pfn_t pfn, unsigned access, int *ret_val) + kvm_pfn_t pfn, unsigned access, int *ret_val) { bool ret = true; @@ -2919,7 +2893,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * If the mapping has been changed, let the vcpu fault on the * same address again. */ - if (!is_rmap_spte(spte)) { + if (!is_shadow_present_pte(spte)) { ret = true; goto exit; } @@ -2974,7 +2948,7 @@ exit: } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable); + gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); static void make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, @@ -2983,7 +2957,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, int r; int level; bool force_pt_level = false; - pfn_t pfn; + kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; @@ -3018,11 +2992,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, - prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); - return r; out_unlock: @@ -3097,8 +3069,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, - 1, ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = __pa(sp->spt); @@ -3110,9 +3081,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, - PT32_ROOT_LEVEL, 1, ACC_ALL, - NULL); + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3149,7 +3118,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, - 0, ACC_ALL, NULL); + 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3182,9 +3151,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, 0, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, + 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3443,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable) + gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) { struct kvm_memory_slot *slot; bool async; @@ -3481,7 +3449,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { - pfn_t pfn; + kvm_pfn_t pfn; int r; int level; bool force_pt_level; @@ -3531,8 +3499,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, make_mmu_pages_available(vcpu); if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, - level, gfn, pfn, prefault); + r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -4058,10 +4025,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->inject_page_fault = kvm_inject_page_fault; /* - * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The - * translation of l2_gpa to l1_gpa addresses is done using the - * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa - * functions between mmu and nested_mmu are swapped. + * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using + * L1's nested page tables (e.g. EPT12). The nested translation + * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using + * L2's page tables as the first level of translation and L1's + * nested page tables as the second level of translation. Basically + * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) { g_context->nx = false; @@ -4495,7 +4464,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) } /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); /* The caller should hold mmu-lock before calling this function. */ static bool @@ -4589,9 +4558,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) spin_unlock(&kvm->mmu_lock); } -static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +static bool slot_rmap_write_protect(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { - return __rmap_write_protect(kvm, rmapp, false); + return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -4627,16 +4597,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, - unsigned long *rmapp) + struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - pfn_t pfn; + kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: - for_each_rmap_spte(rmapp, &iter, sptep) { + for_each_rmap_spte(rmap_head, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 03d518e..dcce533 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) { struct kvm_mmu_page *sp; gfn_t gfn; - pfn_t pfn; + kvm_pfn_t pfn; hpa_t hpa; sp = page_header(__pa(sptep)); @@ -129,7 +129,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *rev_sp; struct kvm_memslots *slots; struct kvm_memory_slot *slot; @@ -150,8 +150,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot); - if (!*rmapp) { + rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no rmap for writable spte %llx\n", @@ -183,7 +183,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) return; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(sp->spt[i])) + if (!is_shadow_present_pte(sp->spt[i])) continue; inspect_spte_has_rmap(kvm, sp->spt + i); @@ -192,7 +192,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - unsigned long *rmapp; + struct kvm_rmap_head *rmap_head; u64 *sptep; struct rmap_iterator iter; struct kvm_memslots *slots; @@ -203,13 +203,14 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); + rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); - for_each_rmap_spte(rmapp, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); + } } static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 3058a22..6c9fed9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, { unsigned pte_access; gfn_t gfn; - pfn_t pfn; + kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; @@ -475,8 +475,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. */ - mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL, - gfn, pfn, true, true); + mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + true, true); return true; } @@ -551,12 +551,12 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, emulate = 0; + int top_level, emulate; direct_access = gw->pte_access; @@ -587,7 +587,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access, it.sptep); + false, access); } /* @@ -598,7 +598,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) - link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); + link_shadow_page(vcpu, it.sptep, sp); } for (; @@ -617,20 +617,18 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK); + true, direct_access); + link_shadow_page(vcpu, it.sptep, sp); } clear_sp_write_flooding_count(it.sptep); - mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate, - it.level, gw->gfn, pfn, prefault, map_writable); + emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); return emulate; out_gpte_changed: - if (sp) - kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); return 0; } @@ -696,7 +694,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - pfn_t pfn; + kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; bool force_pt_level = false; unsigned long mmu_seq; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 22aef20..c13a64b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -86,6 +86,7 @@ static const u32 host_save_user_msrs[] = { MSR_FS_BASE, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_TSC_AUX, }; #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) @@ -135,6 +136,7 @@ struct vcpu_svm { uint64_t asid_generation; uint64_t sysenter_esp; uint64_t sysenter_eip; + uint64_t tsc_aux; u64 next_rip; @@ -1238,6 +1240,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); } } + /* This assumes that the kernel never uses MSR_TSC_AUX */ + if (static_cpu_has(X86_FEATURE_RDTSCP)) + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -3024,6 +3029,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_SYSENTER_ESP: msr_info->data = svm->sysenter_esp; break; + case MSR_TSC_AUX: + if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + return 1; + msr_info->data = svm->tsc_aux; + break; /* * Nobody will change the following 5 values in the VMCB so we can * safely return them on rdmsr. They will always be 0 until LBRV is @@ -3162,6 +3172,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp = data; svm->vmcb->save.sysenter_esp = data; break; + case MSR_TSC_AUX: + if (!boot_cpu_has(X86_FEATURE_RDTSCP)) + return 1; + + /* + * This is rare, so we update the MSR here instead of using + * direct_access_msrs. Doing that would require a rdmsr in + * svm_vcpu_put. + */ + svm->tsc_aux = data; + wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", @@ -3578,12 +3600,16 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) +static bool svm_get_enable_apicv(void) +{ + return false; +} + +static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { - return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { return; } @@ -4054,7 +4080,7 @@ static int svm_get_lpage_level(void) static bool svm_rdtscp_supported(void) { - return false; + return boot_cpu_has(X86_FEATURE_RDTSCP); } static bool svm_invpcid_supported(void) @@ -4345,7 +4371,8 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .cpu_uses_apicv = svm_cpu_uses_apicv, + .get_enable_apicv = svm_get_enable_apicv, + .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 1203025..ad9f6a2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -268,7 +268,7 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(MC) + EXS(MF), EXS(AC), EXS(MC) /* * Tracepoint for kvm interrupt injection: @@ -1025,6 +1025,269 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->pi_desc_addr) ); +/* + * Tracepoint for kvm_hv_notify_acked_sint. + */ +TRACE_EVENT(kvm_hv_notify_acked_sint, + TP_PROTO(int vcpu_id, u32 sint), + TP_ARGS(vcpu_id, sint), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, sint) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->sint = sint; + ), + + TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint) +); + +/* + * Tracepoint for synic_set_irq. + */ +TRACE_EVENT(kvm_hv_synic_set_irq, + TP_PROTO(int vcpu_id, u32 sint, int vector, int ret), + TP_ARGS(vcpu_id, sint, vector, ret), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, sint) + __field(int, vector) + __field(int, ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->sint = sint; + __entry->vector = vector; + __entry->ret = ret; + ), + + TP_printk("vcpu_id %d sint %u vector %d ret %d", + __entry->vcpu_id, __entry->sint, __entry->vector, + __entry->ret) +); + +/* + * Tracepoint for kvm_hv_synic_send_eoi. + */ +TRACE_EVENT(kvm_hv_synic_send_eoi, + TP_PROTO(int vcpu_id, int vector), + TP_ARGS(vcpu_id, vector), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, sint) + __field(int, vector) + __field(int, ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vector = vector; + ), + + TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector) +); + +/* + * Tracepoint for synic_set_msr. + */ +TRACE_EVENT(kvm_hv_synic_set_msr, + TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host), + TP_ARGS(vcpu_id, msr, data, host), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, msr) + __field(u64, data) + __field(bool, host) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->msr = msr; + __entry->data = data; + __entry->host = host + ), + + TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d", + __entry->vcpu_id, __entry->msr, __entry->data, __entry->host) +); + +/* + * Tracepoint for stimer_set_config. + */ +TRACE_EVENT(kvm_hv_stimer_set_config, + TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host), + TP_ARGS(vcpu_id, timer_index, config, host), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, config) + __field(bool, host) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->config = config; + __entry->host = host; + ), + + TP_printk("vcpu_id %d timer %d config 0x%llx host %d", + __entry->vcpu_id, __entry->timer_index, __entry->config, + __entry->host) +); + +/* + * Tracepoint for stimer_set_count. + */ +TRACE_EVENT(kvm_hv_stimer_set_count, + TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host), + TP_ARGS(vcpu_id, timer_index, count, host), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, count) + __field(bool, host) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->count = count; + __entry->host = host; + ), + + TP_printk("vcpu_id %d timer %d count %llu host %d", + __entry->vcpu_id, __entry->timer_index, __entry->count, + __entry->host) +); + +/* + * Tracepoint for stimer_start(periodic timer case). + */ +TRACE_EVENT(kvm_hv_stimer_start_periodic, + TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time), + TP_ARGS(vcpu_id, timer_index, time_now, exp_time), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, time_now) + __field(u64, exp_time) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->time_now = time_now; + __entry->exp_time = exp_time; + ), + + TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu", + __entry->vcpu_id, __entry->timer_index, __entry->time_now, + __entry->exp_time) +); + +/* + * Tracepoint for stimer_start(one-shot timer case). + */ +TRACE_EVENT(kvm_hv_stimer_start_one_shot, + TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count), + TP_ARGS(vcpu_id, timer_index, time_now, count), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(u64, time_now) + __field(u64, count) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->time_now = time_now; + __entry->count = count; + ), + + TP_printk("vcpu_id %d timer %d time_now %llu count %llu", + __entry->vcpu_id, __entry->timer_index, __entry->time_now, + __entry->count) +); + +/* + * Tracepoint for stimer_timer_callback. + */ +TRACE_EVENT(kvm_hv_stimer_callback, + TP_PROTO(int vcpu_id, int timer_index), + TP_ARGS(vcpu_id, timer_index), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + ), + + TP_printk("vcpu_id %d timer %d", + __entry->vcpu_id, __entry->timer_index) +); + +/* + * Tracepoint for stimer_expiration. + */ +TRACE_EVENT(kvm_hv_stimer_expiration, + TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), + TP_ARGS(vcpu_id, timer_index, msg_send_result), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + __field(int, msg_send_result) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + __entry->msg_send_result = msg_send_result; + ), + + TP_printk("vcpu_id %d timer %d msg send result %d", + __entry->vcpu_id, __entry->timer_index, + __entry->msg_send_result) +); + +/* + * Tracepoint for stimer_cleanup. + */ +TRACE_EVENT(kvm_hv_stimer_cleanup, + TP_PROTO(int vcpu_id, int timer_index), + TP_ARGS(vcpu_id, timer_index), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, timer_index) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->timer_index = timer_index; + ), + + TP_printk("vcpu_id %d timer %d", + __entry->vcpu_id, __entry->timer_index) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 44976a5..e2951b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -19,6 +19,7 @@ #include "irq.h" #include "mmu.h" #include "cpuid.h" +#include "lapic.h" #include <linux/kvm_host.h> #include <linux/module.h> @@ -862,7 +863,6 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -870,7 +870,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); @@ -1448,7 +1447,51 @@ static inline void ept_sync_context(u64 eptp) } } -static __always_inline unsigned long vmcs_readl(unsigned long field) +static __always_inline void vmcs_check16(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "16-bit accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "16-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "16-bit accessor invalid for 32-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "16-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check32(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "32-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "32-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_check64(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "64-bit accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "64-bit accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "64-bit accessor invalid for 32-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, + "64-bit accessor invalid for natural width field"); +} + +static __always_inline void vmcs_checkl(unsigned long field) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, + "Natural width accessor invalid for 16-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, + "Natural width accessor invalid for 64-bit field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, + "Natural width accessor invalid for 64-bit high field"); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, + "Natural width accessor invalid for 32-bit field"); +} + +static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; @@ -1459,23 +1502,32 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) static __always_inline u16 vmcs_read16(unsigned long field) { - return vmcs_readl(field); + vmcs_check16(field); + return __vmcs_readl(field); } static __always_inline u32 vmcs_read32(unsigned long field) { - return vmcs_readl(field); + vmcs_check32(field); + return __vmcs_readl(field); } static __always_inline u64 vmcs_read64(unsigned long field) { + vmcs_check64(field); #ifdef CONFIG_X86_64 - return vmcs_readl(field); + return __vmcs_readl(field); #else - return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); + return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); #endif } +static __always_inline unsigned long vmcs_readl(unsigned long field) +{ + vmcs_checkl(field); + return __vmcs_readl(field); +} + static noinline void vmwrite_error(unsigned long field, unsigned long value) { printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", @@ -1483,7 +1535,7 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) dump_stack(); } -static void vmcs_writel(unsigned long field, unsigned long value) +static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { u8 error; @@ -1493,33 +1545,46 @@ static void vmcs_writel(unsigned long field, unsigned long value) vmwrite_error(field, value); } -static void vmcs_write16(unsigned long field, u16 value) +static __always_inline void vmcs_write16(unsigned long field, u16 value) { - vmcs_writel(field, value); + vmcs_check16(field); + __vmcs_writel(field, value); } -static void vmcs_write32(unsigned long field, u32 value) +static __always_inline void vmcs_write32(unsigned long field, u32 value) { - vmcs_writel(field, value); + vmcs_check32(field); + __vmcs_writel(field, value); } -static void vmcs_write64(unsigned long field, u64 value) +static __always_inline void vmcs_write64(unsigned long field, u64 value) { - vmcs_writel(field, value); + vmcs_check64(field); + __vmcs_writel(field, value); #ifndef CONFIG_X86_64 asm volatile (""); - vmcs_writel(field+1, value >> 32); + __vmcs_writel(field+1, value >> 32); #endif } -static void vmcs_clear_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_writel(unsigned long field, unsigned long value) { - vmcs_writel(field, vmcs_readl(field) & ~mask); + vmcs_checkl(field); + __vmcs_writel(field, value); } -static void vmcs_set_bits(unsigned long field, u32 mask) +static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) { - vmcs_writel(field, vmcs_readl(field) | mask); + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_clear_bits does not support 64-bit fields"); + __vmcs_writel(field, __vmcs_readl(field) & ~mask); +} + +static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) +{ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, + "vmcs_set_bits does not support 64-bit fields"); + __vmcs_writel(field, __vmcs_readl(field) | mask); } static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) @@ -2498,7 +2563,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_cpu_uses_apicv(&vmx->vcpu)) + if (kvm_vcpu_apicv_active(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -4186,7 +4251,7 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { int i, idx, r = 0; - pfn_t identity_map_pfn; + kvm_pfn_t identity_map_pfn; u32 tmp; if (!enable_ept) @@ -4462,9 +4527,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) +static bool vmx_get_enable_apicv(void) { - return enable_apicv && lapic_in_kernel(vcpu); + return enable_apicv; } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4586,11 +4651,6 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); } -static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) -{ - return; -} - /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -4660,11 +4720,18 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_cpu_uses_apicv(&vmx->vcpu)) + if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } +static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); +} + static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4703,7 +4770,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_cpu_uses_apicv(&vmx->vcpu)) + if (!kvm_vcpu_apicv_active(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4767,7 +4834,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - if (vmx_cpu_uses_apicv(&vmx->vcpu)) { + if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4775,7 +4842,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); - vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } @@ -4867,7 +4934,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_write32(GUEST_CS_BASE, 0xffff0000); + vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); @@ -4903,7 +4970,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); setup_msrs(vmx); @@ -4919,7 +4986,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_cpu_uses_apicv(vcpu)) + if (kvm_vcpu_apicv_active(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -6203,15 +6270,6 @@ static __init int hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 48; } - if (enable_apicv) - kvm_x86_ops->update_cr8_intercept = NULL; - else { - kvm_x86_ops->hwapic_irr_update = NULL; - kvm_x86_ops->hwapic_isr_update = NULL; - kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; - } - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -7901,7 +7959,7 @@ static void dump_vmcs(void) u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); u32 secondary_exec_control = 0; unsigned long cr4 = vmcs_readl(GUEST_CR4); - u64 efer = vmcs_readl(GUEST_IA32_EFER); + u64 efer = vmcs_read64(GUEST_IA32_EFER); int i, n; if (cpu_has_secondary_exec_ctrls()) @@ -7917,10 +7975,10 @@ static void dump_vmcs(void) if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) { - pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", - vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); - pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n", - vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); + pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); } pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); @@ -7941,16 +7999,16 @@ static void dump_vmcs(void) vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) - pr_err("EFER = 0x%016llx PAT = 0x%016lx\n", - efer, vmcs_readl(GUEST_IA32_PAT)); - pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n", - vmcs_readl(GUEST_IA32_DEBUGCTL), + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + efer, vmcs_read64(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016lx\n", - vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL)); + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) - pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS)); + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); pr_err("Interruptibility = %08x ActivityState = %08x\n", vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), vmcs_read32(GUEST_ACTIVITY_STATE)); @@ -7979,11 +8037,12 @@ static void dump_vmcs(void) vmcs_read32(HOST_IA32_SYSENTER_CS), vmcs_readl(HOST_IA32_SYSENTER_EIP)); if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) - pr_err("EFER = 0x%016lx PAT = 0x%016lx\n", - vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT)); + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", + vmcs_read64(HOST_IA32_EFER), + vmcs_read64(HOST_IA32_PAT)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - pr_err("PerfGlobCtl = 0x%016lx\n", - vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL)); + pr_err("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", @@ -8006,16 +8065,16 @@ static void dump_vmcs(void) pr_err("IDTVectoring: info=%08x errcode=%08x\n", vmcs_read32(IDT_VECTORING_INFO_FIELD), vmcs_read32(IDT_VECTORING_ERROR_CODE)); - pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) - pr_err("TSC Multiplier = 0x%016lx\n", - vmcs_readl(TSC_MULTIPLIER)); + pr_err("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) - pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); n = vmcs_read32(CR3_TARGET_COUNT); for (i = 0; i + 1 < n; i += 4) pr_err("CR3 target%u=%016lx target%u=%016lx\n", @@ -8154,7 +8213,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_cpu_uses_apicv(vcpu)) + !kvm_vcpu_apicv_active(vcpu)) return; if (!cpu_need_tpr_shadow(vcpu)) @@ -8259,10 +8318,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { - u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; - if (!vmx_cpu_uses_apicv(vcpu)) + if (!kvm_vcpu_apicv_active(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8932,7 +8990,8 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) best->ebx &= ~bit(X86_FEATURE_INVPCID); } - vmcs_set_secondary_exec_control(secondary_exec_ctl); + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(secondary_exec_ctl); if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { if (guest_cpuid_has_pcommit(vcpu)) @@ -9508,7 +9567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write64(POSTED_INTR_DESC_ADDR, page_to_phys(vmx->nested.pi_desc_page) + (unsigned long)(vmcs12->posted_intr_desc_addr & @@ -10169,7 +10228,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * Additionally, restore L2's PDPTR to vmcs12. */ if (enable_ept) { - vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3); + vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); @@ -10805,7 +10864,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .cpu_uses_apicv = vmx_cpu_uses_apicv, + .get_enable_apicv = vmx_get_enable_apicv, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 97592e1..4244c2b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -951,7 +951,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, }; static unsigned num_msrs_to_save; @@ -966,6 +966,8 @@ static u32 emulated_msrs[] = { HV_X64_MSR_RESET, HV_X64_MSR_VP_INDEX, HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -1167,7 +1169,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) ++version; - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) + return; /* * The guest calculates current wall clock time by adding @@ -1683,6 +1686,11 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } +void kvm_make_mclock_inprogress_request(struct kvm *kvm) +{ + kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); +} + static void kvm_gen_update_masterclock(struct kvm *kvm) { #ifdef CONFIG_X86_64 @@ -2198,6 +2206,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: @@ -2402,6 +2411,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data); break; @@ -2541,6 +2551,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2693,6 +2704,11 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2748,7 +2764,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_x86_ops->sync_pir_to_irr(vcpu); + if (vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); return 0; @@ -3191,6 +3209,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) return 0; } +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC: + return kvm_hv_activate_synic(vcpu); + default: + return -EINVAL; + } +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3455,6 +3487,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_set_guest_paused(vcpu); goto out; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } default: r = -EINVAL; } @@ -4006,16 +4047,17 @@ static void kvm_init_msr_list(void) /* * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. We could work around this - * in VMX with the generic MSR save/load machinery, but it - * is not really worthwhile since it will really only - * happen with nested virtualization. + * to the guests in some cases. */ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: if (!kvm_x86_ops->mpx_supported()) continue; break; + case MSR_TSC_AUX: + if (!kvm_x86_ops->rdtscp_supported()) + continue; + break; default: break; } @@ -5106,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, int emulation_type) { gpa_t gpa = cr2; - pfn_t pfn; + kvm_pfn_t pfn; if (emulation_type & EMULTYPE_NO_REEXECUTE) return false; @@ -5872,6 +5914,12 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } +void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) +{ + vcpu->arch.apicv_active = false; + kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -5965,6 +6013,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!vcpu->arch.apic) return; + if (vcpu->arch.apicv_active) + return; + if (!vcpu->arch.apic->vapic_addr) max_irr = kvm_lapic_find_highest_irr(vcpu); else @@ -6301,20 +6352,30 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } +void kvm_make_scan_ioapic_request(struct kvm *kvm) +{ + kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); +} + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { + u64 eoi_exit_bitmap[4]; + if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); + bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); if (irqchip_split(vcpu->kvm)) - kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + if (vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } - kvm_x86_ops->load_eoi_exitmap(vcpu); + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, + vcpu_to_synic(vcpu)->vec_bitmap, 256); + kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) @@ -6422,7 +6483,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); if (test_bit(vcpu->arch.pending_ioapic_eoi, - (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->arch.ioapic_handled_vectors)) { vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; vcpu->run->eoi.vector = vcpu->arch.pending_ioapic_eoi; @@ -6446,6 +6507,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv = vcpu->arch.hyperv.exit; + r = 0; + goto out; + } + + /* + * KVM_REQ_HV_STIMER has to be processed after + * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers + * depend on the guest clock being up-to-date + */ + if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) + kvm_hv_process_stimers(vcpu); } /* @@ -6457,7 +6532,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * Update architecture specific hints for APIC * virtual interrupt delivery. */ - if (kvm_x86_ops->hwapic_irr_update) + if (vcpu->arch.apicv_active) kvm_x86_ops->hwapic_irr_update(vcpu, kvm_lapic_find_highest_irr(vcpu)); } @@ -7528,6 +7603,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) @@ -7585,6 +7661,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.pending_external_vector = -1; + kvm_hv_vcpu_init(vcpu); + return 0; fail_free_mce_banks: @@ -7603,6 +7681,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; + kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); @@ -7997,6 +8076,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_cpu_has_interrupt(vcpu)) return true; + if (kvm_hv_has_stimer_pending(vcpu)) + return true; + return false; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index ae9a37b..6d5eb59 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -9,6 +9,7 @@ #include <linux/vmstat.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/memremap.h> #include <asm/pgtable.h> @@ -63,6 +64,16 @@ retry: #endif } +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -71,7 +82,9 @@ retry: static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; unsigned long mask; + int nr_start = *nr; pte_t *ptep; mask = _PAGE_PRESENT|_PAGE_USER; @@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } - if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + page = pte_page(pte); + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + pte_unmap(ptep); + return 0; + } + } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); get_page(page); + put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + unsigned long pfn = pmd_pfn(pmd); + struct dev_pagemap *pgmap = NULL; + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, mask |= _PAGE_RW; if ((pmd_flags(pmd) & mask) != mask) return 0; + + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); + if (pmd_devmap(pmd)) + return __gup_device_huge_pmd(pmd, addr, end, pages, nr); + /* hugepages are never "special" */ VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; head = pmd_page(pmd); @@ -136,8 +187,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -158,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - /* - * The pmd_trans_splitting() check below explains why - * pmdp_splitting_flush has to flush the tlb, to stop - * this gup-fast code from running while we set the - * splitting bit in the pmd. Returning zero will take - * the slow path that will call wait_split_huge_page() - * if the pmd is still in splitting state. gup-fast - * can't because it has irq disabled and - * wait_split_huge_page() would never return as the - * tlb flush IPI wouldn't run. - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* @@ -212,8 +250,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482..5488d21 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/memremap.h> #include <linux/nmi.h> #include <linux/gfp.h> #include <linux/kcore.h> @@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; unsigned int nr_pages = 1 << order; + struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page); + + if (altmap) { + vmem_altmap_free(altmap, nr_pages); + return; + } /* bootmem page has reserved flag */ if (PageReserved(page)) { @@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + struct page *page = pfn_to_page(start_pfn); + struct vmem_altmap *altmap; struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - kernel_physical_mapping_remove(start, start + size); + /* With altmap the first mapped page is offset from @start */ + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + zone = page_zone(page); ret = __remove_pages(zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + kernel_physical_mapping_remove(start, start + size); return ret; } @@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node) + unsigned long end, int node, struct vmem_altmap *altmap) { unsigned long addr; unsigned long next; @@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (pmd_none(*pmd)) { void *p; - p = vmemmap_alloc_block_buf(PMD_SIZE, node); + p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { pte_t entry; @@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; continue; - } + } else if (altmap) + return -ENOMEM; /* no fallback */ } else if (pmd_large(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); continue; @@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { + struct vmem_altmap *altmap = to_vmem_altmap(start); int err; if (cpu_has_pse) - err = vmemmap_populate_hugepages(start, end, node); - else + err = vmemmap_populate_hugepages(start, end, node, altmap); + else if (altmap) { + pr_err_once("%s: no cpu support for altmap allocations\n", + __func__); + err = -ENOMEM; + } else err = vmemmap_populate_basepages(start, end, node); if (!err) sync_global_pgds(start, end - 1, 0); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 844b06d..96bd1e2 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void) { unsigned long rnd; - /* - * 8 bits of randomness in 32bit mmaps, 20 address space bits - * 28 bits of randomness in 64bit mmaps, 40 address space bits - */ if (mmap_is_ia32()) - rnd = (unsigned long)get_random_int() % (1<<8); +#ifdef CONFIG_COMPAT + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); +#else + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); +#endif else - rnd = (unsigned long)get_random_int() % (1<<28); + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 031782e..f4ae536 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,6 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> @@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, } int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn) + pfn_t pfn) { enum page_cache_mode pcm; @@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, return 0; /* Set prot based on lookup */ - pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ee9c2e3..4eb287e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } - -void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - int set; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)pmdp); - if (set) { - /* need tlb flush only to serialize against gup-fast */ - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } -} #endif /** diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7599197..4286f36 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -193,7 +193,7 @@ struct jit_context { 32 /* space for rbx, r13, r14, r15 */ + \ 8 /* space for skb_copy_bits() buffer */) -#define PROLOGUE_SIZE 51 +#define PROLOGUE_SIZE 48 /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program @@ -229,11 +229,15 @@ static void emit_prologue(u8 **pprog) /* mov qword ptr [rbp-X],r15 */ EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); - /* clear A and X registers */ - EMIT2(0x31, 0xc0); /* xor eax, eax */ - EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls + * we need to reset the counter to 0. It's done in two instructions, + * resetting rax register to 0 (xor on eax gets 0 extended), and + * moving it to the counter location. + */ - /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + /* xor eax, eax */ + EMIT2(0x31, 0xc0); + /* mov qword ptr [rbp-X], rax */ EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); @@ -455,6 +459,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } case BPF_ALU | BPF_MOV | BPF_K: + /* optimization: if imm32 is zero, use 'xor <dst>,<dst>' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + break; + } + /* mov %eax, imm32 */ if (is_ereg(dst_reg)) EMIT1(add_1mod(0x40, dst_reg)); @@ -469,6 +485,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, return -EINVAL; } + /* optimization: if imm64 is zero, use 'xor <dst>,<dst>' + * to save 7 bytes. + */ + if (insn[0].imm == 0 && insn[1].imm == 0) { + b1 = add_2mod(0x48, dst_reg, dst_reg); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); + + insn++; + i++; + break; + } + /* movabsq %rax, imm64 */ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); EMIT(insn[0].imm, 4); diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 5c6fc35..97062a6 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -23,6 +23,8 @@ obj-y += bus_numa.o obj-$(CONFIG_AMD_NB) += amd_bus.o obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o +obj-$(CONFIG_VMD) += vmd.o + ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index eccd4d9..2879efc 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -641,6 +641,43 @@ unsigned int pcibios_assign_all_busses(void) return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; } +#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS) +static LIST_HEAD(dma_domain_list); +static DEFINE_SPINLOCK(dma_domain_list_lock); + +void add_dma_domain(struct dma_domain *domain) +{ + spin_lock(&dma_domain_list_lock); + list_add(&domain->node, &dma_domain_list); + spin_unlock(&dma_domain_list_lock); +} +EXPORT_SYMBOL_GPL(add_dma_domain); + +void del_dma_domain(struct dma_domain *domain) +{ + spin_lock(&dma_domain_list_lock); + list_del(&domain->node); + spin_unlock(&dma_domain_list_lock); +} +EXPORT_SYMBOL_GPL(del_dma_domain); + +static void set_dma_domain_ops(struct pci_dev *pdev) +{ + struct dma_domain *domain; + + spin_lock(&dma_domain_list_lock); + list_for_each_entry(domain, &dma_domain_list, node) { + if (pci_domain_nr(pdev->bus) == domain->domain_nr) { + pdev->dev.archdata.dma_ops = domain->dma_ops; + break; + } + } + spin_unlock(&dma_domain_list_lock); +} +#else +static void set_dma_domain_ops(struct pci_dev *pdev) {} +#endif + int pcibios_add_device(struct pci_dev *dev) { struct setup_data *data; @@ -670,6 +707,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = data->next; iounmap(data); } + set_dma_domain_ops(dev); return 0; } diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 9b83b90..9770e55 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -180,6 +180,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, unsigned long result = 0; unsigned long flags; unsigned long bx = (bus << 8) | devfn; + u16 number = 0, mask = 0; WARN_ON(seg); if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) @@ -189,53 +190,35 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, switch (len) { case 1: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=c" (*value), - "=a" (result) - : "1" (PCIBIOS_READ_CONFIG_BYTE), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); - /* - * Zero-extend the result beyond 8 bits, do not trust the - * BIOS having done it: - */ - *value &= 0xff; + number = PCIBIOS_READ_CONFIG_BYTE; + mask = 0xff; break; case 2: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=c" (*value), - "=a" (result) - : "1" (PCIBIOS_READ_CONFIG_WORD), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); - /* - * Zero-extend the result beyond 16 bits, do not trust the - * BIOS having done it: - */ - *value &= 0xffff; + number = PCIBIOS_READ_CONFIG_WORD; + mask = 0xffff; break; case 4: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=c" (*value), - "=a" (result) - : "1" (PCIBIOS_READ_CONFIG_DWORD), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_READ_CONFIG_DWORD; break; } + __asm__("lcall *(%%esi); cld\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=c" (*value), + "=a" (result) + : "1" (number), + "b" (bx), + "D" ((long)reg), + "S" (&pci_indirect)); + /* + * Zero-extend the result beyond 8 or 16 bits, do not trust the + * BIOS having done it: + */ + if (mask) + *value &= mask; + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); @@ -247,6 +230,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, unsigned long result = 0; unsigned long flags; unsigned long bx = (bus << 8) | devfn; + u16 number = 0; WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) @@ -256,43 +240,27 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, switch (len) { case 1: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=a" (result) - : "0" (PCIBIOS_WRITE_CONFIG_BYTE), - "c" (value), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_WRITE_CONFIG_BYTE; break; case 2: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=a" (result) - : "0" (PCIBIOS_WRITE_CONFIG_WORD), - "c" (value), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_WRITE_CONFIG_WORD; break; case 4: - __asm__("lcall *(%%esi); cld\n\t" - "jc 1f\n\t" - "xor %%ah, %%ah\n" - "1:" - : "=a" (result) - : "0" (PCIBIOS_WRITE_CONFIG_DWORD), - "c" (value), - "b" (bx), - "D" ((long)reg), - "S" (&pci_indirect)); + number = PCIBIOS_WRITE_CONFIG_DWORD; break; } + __asm__("lcall *(%%esi); cld\n\t" + "jc 1f\n\t" + "xor %%ah, %%ah\n" + "1:" + : "=a" (result) + : "0" (number), + "c" (value), + "b" (bx), + "D" ((long)reg), + "S" (&pci_indirect)); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c new file mode 100644 index 0000000..d57e480 --- /dev/null +++ b/arch/x86/pci/vmd.c @@ -0,0 +1,723 @@ +/* + * Volume Management Device driver + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/device.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/msi.h> +#include <linux/pci.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> + +#include <asm/irqdomain.h> +#include <asm/device.h> +#include <asm/msi.h> +#include <asm/msidef.h> + +#define VMD_CFGBAR 0 +#define VMD_MEMBAR1 2 +#define VMD_MEMBAR2 4 + +/* + * Lock for manipulating VMD IRQ lists. + */ +static DEFINE_RAW_SPINLOCK(list_lock); + +/** + * struct vmd_irq - private data to map driver IRQ to the VMD shared vector + * @node: list item for parent traversal. + * @rcu: RCU callback item for freeing. + * @irq: back pointer to parent. + * @virq: the virtual IRQ value provided to the requesting driver. + * + * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to + * a VMD IRQ using this structure. + */ +struct vmd_irq { + struct list_head node; + struct rcu_head rcu; + struct vmd_irq_list *irq; + unsigned int virq; +}; + +/** + * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector + * @irq_list: the list of irq's the VMD one demuxes to. + * @vmd_vector: the h/w IRQ assigned to the VMD. + * @index: index into the VMD MSI-X table; used for message routing. + * @count: number of child IRQs assigned to this vector; used to track + * sharing. + */ +struct vmd_irq_list { + struct list_head irq_list; + struct vmd_dev *vmd; + unsigned int vmd_vector; + unsigned int index; + unsigned int count; +}; + +struct vmd_dev { + struct pci_dev *dev; + + spinlock_t cfg_lock; + char __iomem *cfgbar; + + int msix_count; + struct msix_entry *msix_entries; + struct vmd_irq_list *irqs; + + struct pci_sysdata sysdata; + struct resource resources[3]; + struct irq_domain *irq_domain; + struct pci_bus *bus; + +#ifdef CONFIG_X86_DEV_DMA_OPS + struct dma_map_ops dma_ops; + struct dma_domain dma_domain; +#endif +}; + +static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) +{ + return container_of(bus->sysdata, struct vmd_dev, sysdata); +} + +/* + * Drivers managing a device in a VMD domain allocate their own IRQs as before, + * but the MSI entry for the hardware it's driving will be programmed with a + * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its + * domain into one of its own, and the VMD driver de-muxes these for the + * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations + * and irq_chip to set this up. + */ +static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct vmd_irq *vmdirq = data->chip_data; + struct vmd_irq_list *irq = vmdirq->irq; + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index); + msg->data = 0; +} + +/* + * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops. + */ +static void vmd_irq_enable(struct irq_data *data) +{ + struct vmd_irq *vmdirq = data->chip_data; + + raw_spin_lock(&list_lock); + list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); + raw_spin_unlock(&list_lock); + + data->chip->irq_unmask(data); +} + +static void vmd_irq_disable(struct irq_data *data) +{ + struct vmd_irq *vmdirq = data->chip_data; + + data->chip->irq_mask(data); + + raw_spin_lock(&list_lock); + list_del_rcu(&vmdirq->node); + raw_spin_unlock(&list_lock); +} + +/* + * XXX: Stubbed until we develop acceptable way to not create conflicts with + * other devices sharing the same vector. + */ +static int vmd_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + return -EINVAL; +} + +static struct irq_chip vmd_msi_controller = { + .name = "VMD-MSI", + .irq_enable = vmd_irq_enable, + .irq_disable = vmd_irq_disable, + .irq_compose_msi_msg = vmd_compose_msi_msg, + .irq_set_affinity = vmd_irq_set_affinity, +}; + +static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return 0; +} + +/* + * XXX: We can be even smarter selecting the best IRQ once we solve the + * affinity problem. + */ +static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd) +{ + int i, best = 0; + + raw_spin_lock(&list_lock); + for (i = 1; i < vmd->msix_count; i++) + if (vmd->irqs[i].count < vmd->irqs[best].count) + best = i; + vmd->irqs[best].count++; + raw_spin_unlock(&list_lock); + + return &vmd->irqs[best]; +} + +static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, + unsigned int virq, irq_hw_number_t hwirq, + msi_alloc_info_t *arg) +{ + struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus); + struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL); + + if (!vmdirq) + return -ENOMEM; + + INIT_LIST_HEAD(&vmdirq->node); + vmdirq->irq = vmd_next_irq(vmd); + vmdirq->virq = virq; + + irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip, + vmdirq, handle_simple_irq, vmd, NULL); + return 0; +} + +static void vmd_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + struct vmd_irq *vmdirq = irq_get_chip_data(virq); + + /* XXX: Potential optimization to rebalance */ + raw_spin_lock(&list_lock); + vmdirq->irq->count--; + raw_spin_unlock(&list_lock); + + kfree_rcu(vmdirq, rcu); +} + +static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct vmd_dev *vmd = vmd_from_bus(pdev->bus); + + if (nvec > vmd->msix_count) + return vmd->msix_count; + + memset(arg, 0, sizeof(*arg)); + return 0; +} + +static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; +} + +static struct msi_domain_ops vmd_msi_domain_ops = { + .get_hwirq = vmd_get_hwirq, + .msi_init = vmd_msi_init, + .msi_free = vmd_msi_free, + .msi_prepare = vmd_msi_prepare, + .set_desc = vmd_set_desc, +}; + +static struct msi_domain_info vmd_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &vmd_msi_domain_ops, + .chip = &vmd_msi_controller, +}; + +#ifdef CONFIG_X86_DEV_DMA_OPS +/* + * VMD replaces the requester ID with its own. DMA mappings for devices in a + * VMD domain need to be mapped for the VMD, not the device requiring + * the mapping. + */ +static struct device *to_vmd_dev(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct vmd_dev *vmd = vmd_from_bus(pdev->bus); + + return &vmd->dev->dev; +} + +static struct dma_map_ops *vmd_dma_ops(struct device *dev) +{ + return to_vmd_dev(dev)->archdata.dma_ops; +} + +static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, + gfp_t flag, struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag, + attrs); +} + +static void vmd_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t addr, struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr, + attrs); +} + +static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t addr, size_t size, + struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr, + size, attrs); +} + +static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t addr, size_t size, + struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr, + addr, size, attrs); +} + +static dma_addr_t vmd_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size, + dir, attrs); +} + +static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs); +} + +static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs); +} + +static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs); +} + +static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir); +} + +static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size, + dir); +} + +static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir); +} + +static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir); +} + +static int vmd_mapping_error(struct device *dev, dma_addr_t addr) +{ + return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr); +} + +static int vmd_dma_supported(struct device *dev, u64 mask) +{ + return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask); +} + +#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK +static u64 vmd_get_required_mask(struct device *dev) +{ + return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev)); +} +#endif + +static void vmd_teardown_dma_ops(struct vmd_dev *vmd) +{ + struct dma_domain *domain = &vmd->dma_domain; + + if (vmd->dev->dev.archdata.dma_ops) + del_dma_domain(domain); +} + +#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \ + do { \ + if (source->fn) \ + dest->fn = vmd_##fn; \ + } while (0) + +static void vmd_setup_dma_ops(struct vmd_dev *vmd) +{ + const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops; + struct dma_map_ops *dest = &vmd->dma_ops; + struct dma_domain *domain = &vmd->dma_domain; + + domain->domain_nr = vmd->sysdata.domain; + domain->dma_ops = dest; + + if (!source) + return; + ASSIGN_VMD_DMA_OPS(source, dest, alloc); + ASSIGN_VMD_DMA_OPS(source, dest, free); + ASSIGN_VMD_DMA_OPS(source, dest, mmap); + ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable); + ASSIGN_VMD_DMA_OPS(source, dest, map_page); + ASSIGN_VMD_DMA_OPS(source, dest, unmap_page); + ASSIGN_VMD_DMA_OPS(source, dest, map_sg); + ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg); + ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu); + ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device); + ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu); + ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device); + ASSIGN_VMD_DMA_OPS(source, dest, mapping_error); + ASSIGN_VMD_DMA_OPS(source, dest, dma_supported); +#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK + ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask); +#endif + add_dma_domain(domain); +} +#undef ASSIGN_VMD_DMA_OPS +#else +static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {} +static void vmd_setup_dma_ops(struct vmd_dev *vmd) {} +#endif + +static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, + unsigned int devfn, int reg, int len) +{ + char __iomem *addr = vmd->cfgbar + + (bus->number << 20) + (devfn << 12) + reg; + + if ((addr - vmd->cfgbar) + len >= + resource_size(&vmd->dev->resource[VMD_CFGBAR])) + return NULL; + + return addr; +} + +/* + * CPU may deadlock if config space is not serialized on some versions of this + * hardware, so all config space access is done under a spinlock. + */ +static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, + int len, u32 *value) +{ + struct vmd_dev *vmd = vmd_from_bus(bus); + char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len); + unsigned long flags; + int ret = 0; + + if (!addr) + return -EFAULT; + + spin_lock_irqsave(&vmd->cfg_lock, flags); + switch (len) { + case 1: + *value = readb(addr); + break; + case 2: + *value = readw(addr); + break; + case 4: + *value = readl(addr); + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&vmd->cfg_lock, flags); + return ret; +} + +/* + * VMD h/w converts non-posted config writes to posted memory writes. The + * read-back in this function forces the completion so it returns only after + * the config space was written, as expected. + */ +static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, + int len, u32 value) +{ + struct vmd_dev *vmd = vmd_from_bus(bus); + char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len); + unsigned long flags; + int ret = 0; + + if (!addr) + return -EFAULT; + + spin_lock_irqsave(&vmd->cfg_lock, flags); + switch (len) { + case 1: + writeb(value, addr); + readb(addr); + break; + case 2: + writew(value, addr); + readw(addr); + break; + case 4: + writel(value, addr); + readl(addr); + break; + default: + ret = -EINVAL; + break; + } + spin_unlock_irqrestore(&vmd->cfg_lock, flags); + return ret; +} + +static struct pci_ops vmd_ops = { + .read = vmd_pci_read, + .write = vmd_pci_write, +}; + +/* + * VMD domains start at 0x1000 to not clash with ACPI _SEG domains. + */ +static int vmd_find_free_domain(void) +{ + int domain = 0xffff; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus)) != NULL) + domain = max_t(int, domain, pci_domain_nr(bus)); + return domain + 1; +} + +static int vmd_enable_domain(struct vmd_dev *vmd) +{ + struct pci_sysdata *sd = &vmd->sysdata; + struct resource *res; + u32 upper_bits; + unsigned long flags; + LIST_HEAD(resources); + + res = &vmd->dev->resource[VMD_CFGBAR]; + vmd->resources[0] = (struct resource) { + .name = "VMD CFGBAR", + .start = res->start, + .end = (resource_size(res) >> 20) - 1, + .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, + }; + + res = &vmd->dev->resource[VMD_MEMBAR1]; + upper_bits = upper_32_bits(res->end); + flags = res->flags & ~IORESOURCE_SIZEALIGN; + if (!upper_bits) + flags &= ~IORESOURCE_MEM_64; + vmd->resources[1] = (struct resource) { + .name = "VMD MEMBAR1", + .start = res->start, + .end = res->end, + .flags = flags, + }; + + res = &vmd->dev->resource[VMD_MEMBAR2]; + upper_bits = upper_32_bits(res->end); + flags = res->flags & ~IORESOURCE_SIZEALIGN; + if (!upper_bits) + flags &= ~IORESOURCE_MEM_64; + vmd->resources[2] = (struct resource) { + .name = "VMD MEMBAR2", + .start = res->start + 0x2000, + .end = res->end, + .flags = flags, + }; + + sd->domain = vmd_find_free_domain(); + if (sd->domain < 0) + return sd->domain; + + sd->node = pcibus_to_node(vmd->dev->bus); + + vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info, + NULL); + if (!vmd->irq_domain) + return -ENODEV; + + pci_add_resource(&resources, &vmd->resources[0]); + pci_add_resource(&resources, &vmd->resources[1]); + pci_add_resource(&resources, &vmd->resources[2]); + vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd, + &resources); + if (!vmd->bus) { + pci_free_resource_list(&resources); + irq_domain_remove(vmd->irq_domain); + return -ENODEV; + } + + vmd_setup_dma_ops(vmd); + dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); + pci_rescan_bus(vmd->bus); + + WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj, + "domain"), "Can't create symlink to domain\n"); + return 0; +} + +static irqreturn_t vmd_irq(int irq, void *data) +{ + struct vmd_irq_list *irqs = data; + struct vmd_irq *vmdirq; + + rcu_read_lock(); + list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) + generic_handle_irq(vmdirq->virq); + rcu_read_unlock(); + + return IRQ_HANDLED; +} + +static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct vmd_dev *vmd; + int i, err; + + if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20)) + return -ENOMEM; + + vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL); + if (!vmd) + return -ENOMEM; + + vmd->dev = dev; + err = pcim_enable_device(dev); + if (err < 0) + return err; + + vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0); + if (!vmd->cfgbar) + return -ENOMEM; + + pci_set_master(dev); + if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) + return -ENODEV; + + vmd->msix_count = pci_msix_vec_count(dev); + if (vmd->msix_count < 0) + return -ENODEV; + + vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), + GFP_KERNEL); + if (!vmd->irqs) + return -ENOMEM; + + vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count, + sizeof(*vmd->msix_entries), + GFP_KERNEL); + if (!vmd->msix_entries) + return -ENOMEM; + for (i = 0; i < vmd->msix_count; i++) + vmd->msix_entries[i].entry = i; + + vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1, + vmd->msix_count); + if (vmd->msix_count < 0) + return vmd->msix_count; + + for (i = 0; i < vmd->msix_count; i++) { + INIT_LIST_HEAD(&vmd->irqs[i].irq_list); + vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector; + vmd->irqs[i].index = i; + + err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector, + vmd_irq, 0, "vmd", &vmd->irqs[i]); + if (err) + return err; + } + + spin_lock_init(&vmd->cfg_lock); + pci_set_drvdata(dev, vmd); + err = vmd_enable_domain(vmd); + if (err) + return err; + + dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n", + vmd->sysdata.domain); + return 0; +} + +static void vmd_remove(struct pci_dev *dev) +{ + struct vmd_dev *vmd = pci_get_drvdata(dev); + + pci_set_drvdata(dev, NULL); + sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); + pci_stop_root_bus(vmd->bus); + pci_remove_root_bus(vmd->bus); + vmd_teardown_dma_ops(vmd); + irq_domain_remove(vmd->irq_domain); +} + +#ifdef CONFIG_PM +static int vmd_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + pci_save_state(pdev); + return 0; +} + +static int vmd_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + pci_restore_state(pdev); + return 0; +} +#endif +static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); + +static const struct pci_device_id vmd_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),}, + {0,} +}; +MODULE_DEVICE_TABLE(pci, vmd_ids); + +static struct pci_driver vmd_drv = { + .name = "vmd", + .id_table = vmd_ids, + .probe = vmd_probe, + .remove = vmd_remove, + .driver = { + .pm = &vmd_dev_pm_ops, + }, +}; +module_pci_driver(vmd_drv); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION("0.6"); diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c index 5ca8ead..81c769e 100644 --- a/arch/x86/platform/atom/punit_atom_debug.c +++ b/arch/x86/platform/atom/punit_atom_debug.c @@ -25,8 +25,6 @@ #include <asm/cpu_device_id.h> #include <asm/iosf_mbi.h> -/* Side band Interface port */ -#define PUNIT_PORT 0x04 /* Power gate status reg */ #define PWRGT_STATUS 0x61 /* Subsystem config/status Video processor */ @@ -85,9 +83,8 @@ static int punit_dev_state_show(struct seq_file *seq_file, void *unused) seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); while (punit_devp->name) { - status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, - punit_devp->reg, - &punit_pwr_status); + status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ, + punit_devp->reg, &punit_pwr_status); if (status) { seq_printf(seq_file, "%9s : Read Failed\n", punit_devp->name); diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 0ee619f..c1bdafa 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -111,23 +111,19 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; int ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->addr_lo); + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_lo); if (ret) return ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->addr_hi); + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_hi); if (ret) return ret; - ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->rmask); + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->rmask); if (ret) return ret; - return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, - reg++, &imr->wmask); + return iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->wmask); } /** @@ -151,31 +147,27 @@ static int imr_write(struct imr_device *idev, u32 imr_id, local_irq_save(flags); - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++, - imr->addr_lo); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_lo); if (ret) goto failed; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg++, imr->addr_hi); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_hi); if (ret) goto failed; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg++, imr->rmask); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->rmask); if (ret) goto failed; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg++, imr->wmask); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->wmask); if (ret) goto failed; /* Lock bit must be set separately to addr_lo address bits. */ if (lock) { imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, + reg - IMR_NUM_REGS, imr->addr_lo); if (ret) goto failed; } diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2730d77..3e75fcf 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -70,3 +70,4 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index a8fecc2..3ee2bb6 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -17,7 +17,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \ ifeq ($(CONFIG_X86_32),y) obj-y += checksum_32.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 755481f..174781a 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -36,13 +36,6 @@ #endif /* CONFIG_X86_PPRO_FENCE */ #define dma_wmb() barrier() -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() - -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) - -#define read_barrier_depends() do { } while (0) -#define smp_read_barrier_depends() do { } while (0) +#include <asm-generic/barrier.h> #endif diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 81d6562..11ab90d 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -1,6 +1,7 @@ #ifndef __UM_ASM_SYSCALL_H #define __UM_ASM_SYSCALL_H +#include <asm/syscall-generic.h> #include <uapi/linux/audit.h> typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a29756f2..47c78d5 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -68,6 +68,7 @@ static const int reg_offsets[] = { [EFL] = HOST_EFLAGS, [UESP] = HOST_SP, [SS] = HOST_SS, + [ORIG_EAX] = HOST_ORIG_AX, }; int putreg(struct task_struct *child, int regno, unsigned long value) @@ -83,6 +84,7 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + case ORIG_EAX: break; case FS: if (value && (value & 3) != 3) @@ -108,9 +110,6 @@ int putreg(struct task_struct *child, int regno, unsigned long value) value &= FLAG_MASK; child->thread.regs.regs.gp[HOST_EFLAGS] |= value; return 0; - case ORIG_EAX: - child->thread.regs.regs.syscall = value; - return 0; default : panic("Bad register in putreg() : %d\n", regno); } @@ -143,8 +142,6 @@ unsigned long getreg(struct task_struct *child, int regno) regno >>= 2; switch (regno) { - case ORIG_EAX: - return child->thread.regs.regs.syscall; case FS: case GS: case DS: @@ -163,6 +160,7 @@ unsigned long getreg(struct task_struct *child, int regno) case EDI: case EBP: case EFL: + case ORIG_EAX: break; default: panic("Bad register in getreg() : %d\n", regno); diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index acda713..abf4901 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -64,7 +64,7 @@ static u32 xen_apic_read(u32 reg) if (reg != APIC_ID) return 0; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) return 0; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2306392..d09e4c9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -415,7 +415,7 @@ static bool __init xen_check_mwait(void) set_xen_guest_handle(op.u.set_pminfo.pdc, buf); - if ((HYPERVISOR_dom0_op(&op) == 0) && + if ((HYPERVISOR_platform_op(&op) == 0) && (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; @@ -1365,7 +1365,7 @@ static void __init xen_boot_params_init_edd(void) info->params.length = sizeof(info->params); set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, &info->params); - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) break; @@ -1383,7 +1383,7 @@ static void __init xen_boot_params_init_edd(void) op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { op.u.firmware_info.index = nr; - ret = HYPERVISOR_dom0_op(&op); + ret = HYPERVISOR_platform_op(&op); if (ret) break; mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; @@ -1690,7 +1690,7 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; - if (HYPERVISOR_dom0_op(&op) == 0) + if (HYPERVISOR_platform_op(&op) == 0) boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; /* Make sure ACS will be enabled */ diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index df0c405..7f664c4 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -34,7 +34,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + if (!suspend_cancelled) + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index f1ba6a0..a0a4e55 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/slab.h> #include <linux/pvclock_gtod.h> +#include <linux/timekeeper_internal.h> #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> @@ -32,86 +33,12 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -/* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); - /* snapshots of runstate info */ static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); /* unused ns of stolen time */ static DEFINE_PER_CPU(u64, xen_residual_stolen); -/* return an consistent snapshot of 64-bit time/counter value */ -static u64 get64(const u64 *p) -{ - u64 ret; - - if (BITS_PER_LONG < 64) { - u32 *p32 = (u32 *)p; - u32 h, l; - - /* - * Read high then low, and then make sure high is - * still the same; this will only loop if low wraps - * and carries into high. - * XXX some clean way to make this endian-proof? - */ - do { - h = p32[1]; - barrier(); - l = p32[0]; - barrier(); - } while (p32[1] != h); - - ret = (((u64)h) << 32) | l; - } else - ret = *p; - - return ret; -} - -/* - * Runstate accounting - */ -static void get_runstate_snapshot(struct vcpu_runstate_info *res) -{ - u64 state_time; - struct vcpu_runstate_info *state; - - BUG_ON(preemptible()); - - state = this_cpu_ptr(&xen_runstate); - - /* - * The runstate info is always updated by the hypervisor on - * the current CPU, so there's no need to use anything - * stronger than a compiler barrier when fetching it. - */ - do { - state_time = get64(&state->state_entry_time); - barrier(); - *res = *state; - barrier(); - } while (get64(&state->state_entry_time) != state_time); -} - -/* return true when a vcpu could run but has no real cpu to run on */ -bool xen_vcpu_stolen(int vcpu) -{ - return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; -} - -void xen_setup_runstate_info(int cpu) -{ - struct vcpu_register_runstate_memory_area area; - - area.addr.v = &per_cpu(xen_runstate, cpu); - - if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, - cpu, &area)) - BUG(); -} - static void do_stolen_accounting(void) { struct vcpu_runstate_info state; @@ -119,7 +46,7 @@ static void do_stolen_accounting(void) s64 runnable, offline, stolen; cputime_t ticks; - get_runstate_snapshot(&state); + xen_get_runstate_snapshot(&state); WARN_ON(state.state != RUNSTATE_running); @@ -194,26 +121,46 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb, unsigned long was_set, void *priv) { /* Protected by the calling core code serialization */ - static struct timespec next_sync; + static struct timespec64 next_sync; struct xen_platform_op op; - struct timespec now; + struct timespec64 now; + struct timekeeper *tk = priv; + static bool settime64_supported = true; + int ret; - now = __current_kernel_time(); + now.tv_sec = tk->xtime_sec; + now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); /* * We only take the expensive HV call when the clock was set * or when the 11 minutes RTC synchronization time elapsed. */ - if (!was_set && timespec_compare(&now, &next_sync) < 0) + if (!was_set && timespec64_compare(&now, &next_sync) < 0) return NOTIFY_OK; - op.cmd = XENPF_settime; - op.u.settime.secs = now.tv_sec; - op.u.settime.nsecs = now.tv_nsec; - op.u.settime.system_time = xen_clocksource_read(); +again: + if (settime64_supported) { + op.cmd = XENPF_settime64; + op.u.settime64.mbz = 0; + op.u.settime64.secs = now.tv_sec; + op.u.settime64.nsecs = now.tv_nsec; + op.u.settime64.system_time = xen_clocksource_read(); + } else { + op.cmd = XENPF_settime32; + op.u.settime32.secs = now.tv_sec; + op.u.settime32.nsecs = now.tv_nsec; + op.u.settime32.system_time = xen_clocksource_read(); + } + + ret = HYPERVISOR_platform_op(&op); - (void)HYPERVISOR_dom0_op(&op); + if (ret == -ENOSYS && settime64_supported) { + settime64_supported = false; + goto again; + } + if (ret < 0) + return NOTIFY_BAD; /* * Move the next drift compensation time 11 minutes |