Diffstat (limited to 'arch/x86')
145 files changed, 4776 insertions, 2146 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5e28e2b..c2fb8a8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_LIVEPATCH ### Arch settings config X86 @@ -84,6 +85,7 @@ config X86 select HAVE_CMPXCHG_LOCAL select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL @@ -486,6 +488,22 @@ config X86_INTEL_MID Intel MID platforms are based on an Intel processor and chipset which consume less power than most of the x86 derivatives. +config X86_INTEL_QUARK + bool "Intel Quark platform support" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on X86_TSC + depends on PCI + depends on PCI_GOANY + depends on X86_IO_APIC + select IOSF_MBI + select INTEL_IMR + ---help--- + Select to include support for Quark X1000 SoC. + Say Y here if you have a Quark based system such as the Arduino + compatible Intel Galileo. + config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on ACPI @@ -497,6 +515,17 @@ config X86_INTEL_LPSS things like clock tree (common clock framework) and pincontrol which are needed by the LPSS peripheral drivers. +config X86_AMD_PLATFORM_DEVICE + bool "AMD ACPI2Platform devices support" + depends on ACPI + select COMMON_CLK + select PINCTRL + ---help--- + Select to interpret AMD specific ACPI device to platform device + such as I2C, UART, GPIO found on AMD Carrizo and later chipsets. + I2C and UART depend on COMMON_CLK to set clock. GPIO driver is + implemented under PINCTRL subsystem. + config IOSF_MBI tristate "Intel SoC IOSF Sideband support for SoC platforms" depends on PCI @@ -2017,6 +2046,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +source "kernel/livepatch/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 61bd2ad..20028da 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -313,6 +313,19 @@ config DEBUG_NMI_SELFTEST If unsure, say N. +config DEBUG_IMR_SELFTEST + bool "Isolated Memory Region self test" + default n + depends on INTEL_IMR + ---help--- + This option enables automated sanity testing of the IMR code. + Some simple tests are run to verify IMR bounds checking, alignment + and overlapping. This option is really only useful if you are + debugging an IMR memory map or are modifying the IMR code and want to + test your changes. + + If unsure say N here. + config X86_DEBUG_STATIC_CPU_HAS bool "Debug alternatives" depends on DEBUG_KERNEL diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 920e616..5ba2d9c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? 
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) +asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 36b62bc..95eba55 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -30,7 +30,7 @@ cflags-y += -ffreestanding # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots. Also, gcc # 4.3.0 needs -funit-at-a-time for extern inline functions. -KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ +KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \ echo $(call cc-option,-fno-unit-at-a-time); \ else echo $(call cc-option,-funit-at-a-time); fi ;) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 3db07f3..57bbf2f 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -14,6 +14,8 @@ # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. # The number is the same as you would ordinarily press at bootup. +KASAN_SANITIZE := n + SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ad754b4..0a291cd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -16,6 +16,8 @@ # (see scripts/Makefile.lib size_append) # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all +KASAN_SANITIZE := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 @@ -49,6 +51,7 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o $(obj)/vmlinux: $(vmlinux-objs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index bb13763..7083c16 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -14,6 +14,13 @@ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; +struct kaslr_setup_data { + __u64 next; + __u32 type; + __u32 len; + __u8 data[1]; +} kaslr_setup_data; + #define I8254_PORT_CONTROL 0x43 #define I8254_PORT_COUNTER0 0x40 #define I8254_CMD_READBACK 0xC0 @@ -295,7 +302,29 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(unsigned char *input, +static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled) +{ + struct setup_data *data; + + kaslr_setup_data.type = SETUP_KASLR; + kaslr_setup_data.len = 1; + kaslr_setup_data.next = 0; + kaslr_setup_data.data[0] = enabled; + + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + if (data) + data->next = (unsigned long)&kaslr_setup_data; + else + params->hdr.setup_data = (unsigned long)&kaslr_setup_data; + +} + +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned 
long input_size, unsigned char *output, unsigned long output_size) @@ -306,14 +335,17 @@ unsigned char *choose_kernel_location(unsigned char *input, #ifdef CONFIG_HIBERNATION if (!cmdline_find_option_bool("kaslr")) { debug_putstr("KASLR disabled by default...\n"); + add_kaslr_setup_data(params, 0); goto out; } #else if (cmdline_find_option_bool("nokaslr")) { debug_putstr("KASLR disabled by cmdline...\n"); + add_kaslr_setup_data(params, 0); goto out; } #endif + add_kaslr_setup_data(params, 1); /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 92b9a5f..ef17683 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -13,8 +13,7 @@ #include <asm/setup.h> #include <asm/desc.h> -#undef memcpy /* Use memcpy from misc.c */ - +#include "../string.h" #include "eboot.h" static efi_system_table_t *sys_table; diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S index 7ff3632..99494df 100644 --- a/arch/x86/boot/compressed/efi_stub_64.S +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -3,28 +3,3 @@ #include <asm/processor-flags.h> #include "../../platform/efi/efi_stub_64.S" - -#ifdef CONFIG_EFI_MIXED - .code64 - .text -ENTRY(efi64_thunk) - push %rbp - push %rbx - - subq $16, %rsp - leaq efi_exit32(%rip), %rax - movl %eax, 8(%rsp) - leaq efi_gdt64(%rip), %rax - movl %eax, 4(%rsp) - movl %eax, 2(%rax) /* Fixup the gdt base address */ - leaq efi32_boot_gdt(%rip), %rax - movl %eax, (%rsp) - - call __efi64_thunk - - addq $16, %rsp - pop %rbx - pop %rbp - ret -ENDPROC(efi64_thunk) -#endif /* CONFIG_EFI_MIXED */ diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S new file mode 100644 index 0000000..630384a --- /dev/null +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * since the firmware's 32-bit IDT is still currently installed and it + * needs to be able to service interrupts. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identify + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. + */ + +#include <linux/linkage.h> +#include <asm/msr.h> +#include <asm/page_types.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + + .code64 + .text +ENTRY(efi64_thunk) + push %rbp + push %rbx + + subq $8, %rsp + leaq efi_exit32(%rip), %rax + movl %eax, 4(%rsp) + leaq efi_gdt64(%rip), %rax + movl %eax, (%rsp) + movl %eax, 2(%rax) /* Fixup the gdt base address */ + + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + /* + * Convert x86-64 ABI params to i386 ABI + */ + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + sgdt save_gdt(%rip) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* + * Switch to gdt with 32-bit segments. This is the firmware GDT + * that was installed when the kernel started executing. This + * pointer was saved at the EFI stub entry point in head_64.S. 
+ */ + leaq efi32_boot_gdt(%rip), %rax + lgdt (%rax) + + pushq $__KERNEL_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + lgdt save_gdt(%rip) + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + addq $8, %rsp + pop %rbx + pop %rbp + ret +ENDPROC(efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Reload pgtables */ + movl %cr3, %eax + movl %eax, %cr3 + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + /* + * Some firmware will return with interrupts enabled. Be sure to + * disable them before we switch GDTs. + */ + cli + + movl 56(%esp), %eax + movl %eax, 2(%eax) + lgdtl (%eax) + + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl %cr3, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + xorl %eax, %eax + lldt %ax + + movl 60(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + lret +ENDPROC(efi_enter32) + + .data + .balign 8 + .global efi32_boot_gdt +efi32_boot_gdt: .word 0 + .quad 0 + +save_gdt: .word 0 + .quad 0 +func_rt_ptr: .quad 0 + + .global efi_gdt64 +efi_gdt64: + .word efi_gdt64_end - efi_gdt64 + .long 0 /* Filled out by user */ + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x0080890000000000 /* TS descriptor */ + .quad 0x0000000000000000 /* TS continued */ +efi_gdt64_end: diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a950864..5903089 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -401,7 +401,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(input_data, input_len, output, + output = choose_kernel_location(real_mode, input_data, input_len, + output, output_len > run_size ? 
output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 24e3e56..ee3576b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -7,6 +7,7 @@ * we just keep it from happening */ #undef CONFIG_PARAVIRT +#undef CONFIG_KASAN #ifdef CONFIG_X86_32 #define _ASM_X86_DESC_H 1 #endif @@ -56,7 +57,8 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* aslr.c */ -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -64,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 477e9d7..6bd2c6c 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,12 +32,23 @@ #include <linux/linkage.h> #include <asm/inst.h> +/* + * The following macros are used to move an (un)aligned 16 byte value to/from + * an XMM register. This can done for either FP or integer values, for FP use + * movaps (move aligned packed single) or integer use movdqa (move double quad + * aligned). It doesn't make a performance difference which instruction is used + * since Nehalem (original Core i7) was released. However, the movaps is a byte + * shorter, so that is the one we'll use for now. (same for unaligned). 
+ */ +#define MOVADQ movaps +#define MOVUDQ movups + #ifdef __x86_64__ + .data .align 16 .Lgf128mul_x_ble_mask: .octa 0x00000000000000010000000000000087 - POLY: .octa 0xC2000000000000000000000000000001 TWOONE: .octa 0x00000001000000000000000000000001 @@ -89,6 +100,7 @@ enc: .octa 0x2 #define arg8 STACK_OFFSET+16(%r14) #define arg9 STACK_OFFSET+24(%r14) #define arg10 STACK_OFFSET+32(%r14) +#define keysize 2*15*16(%arg1) #endif @@ -213,10 +225,12 @@ enc: .octa 0x2 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i + _get_AAD_loop\num_initial_blocks\operation: movd (%r10), \TMP1 pslldq $12, \TMP1 @@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation: add $4, %r10 sub $4, %r12 jne _get_AAD_loop\num_initial_blocks\operation + cmp $16, %r11 je _get_AAD_loop2_done\num_initial_blocks\operation + mov $16, %r12 _get_AAD_loop2\num_initial_blocks\operation: psrldq $4, %xmm\i sub $4, %r12 cmp %r11, %r12 jne _get_AAD_loop2\num_initial_blocks\operation + _get_AAD_loop2_done\num_initial_blocks\operation: - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data xor %r11, %r11 # initialise the data pointer offset as zero @@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation: mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM0 .if (\i == 5) || (\i == 6) || (\i == 7) + MOVADQ ONE(%RIP),\TMP1 + MOVADQ (%arg1),\TMP2 .irpc index, \i_seq - paddd ONE(%rip), \XMM0 # INCR Y0 + paddd \TMP1, \XMM0 # INCR Y0 movdqa \XMM0, %xmm\index - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap - -.endr -.irpc index, \i_seq - pxor 16*0(%arg1), %xmm\index -.endr -.irpc index, \i_seq - movaps 0x10(%rdi), \TMP1 - AESENC \TMP1, %xmm\index # Round 1 -.endr -.irpc index, \i_seq - movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x30(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x40(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x50(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x60(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + pxor \TMP2, %xmm\index .endr -.irpc index, \i_seq - movaps 0x70(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x80(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x90(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + lea 0x10(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + +aes_loop_initial_dec\num_initial_blocks: + MOVADQ (%r10),\TMP1 +.irpc index, \i_seq + AESENC \TMP1, %xmm\index .endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_initial_dec\num_initial_blocks + + MOVADQ (%r10), \TMP1 .irpc index, \i_seq - movaps 0xa0(%arg1), \TMP1 - AESENCLAST \TMP1, %xmm\index # Round 10 + AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg3 , %r11, 1), \TMP1 @@ -305,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation: add $16, %r11 movdqa \TMP1, %xmm\index - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation + # prepare 
plaintext/ciphertext for GHASH computation .endr .endif GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 @@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation: * Precomputations for HashKey parallel with encryption of first 4 blocks. * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i */ - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM1 - movdqa SHUF_MASK(%rip), %xmm14 + MOVADQ ONE(%rip), \TMP1 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM1 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM2 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM2 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM3 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM3 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM4 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM4 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap - pxor 16*0(%arg1), \XMM1 - pxor 16*0(%arg1), \XMM2 - pxor 16*0(%arg1), \XMM3 - pxor 16*0(%arg1), \XMM4 + MOVADQ 0(%arg1),\TMP1 + pxor \TMP1, \XMM1 + pxor \TMP1, \XMM2 + pxor \TMP1, \XMM3 + pxor \TMP1, \XMM4 movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 @@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation: pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_4_k(%rsp) - movaps 0xa0(%arg1), \TMP2 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_pre_dec_done\num_initial_blocks + +aes_loop_pre_dec\num_initial_blocks: + MOVADQ (%r10),\TMP2 +.irpc index, 1234 + AESENC \TMP2, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_pre_dec\num_initial_blocks + +aes_loop_pre_dec_done\num_initial_blocks: + MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 @@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu \XMM4, 16*3(%arg2 , %r11 , 1) movdqa \TMP1, \XMM4 add $64, %r11 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\num_initial_blocks\operation: @@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation: .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 @@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation: cmp %r11, %r12 jne _get_AAD_loop2\num_initial_blocks\operation _get_AAD_loop2_done\num_initial_blocks\operation: - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data xor %r11, %r11 # initialise the data pointer offset as zero @@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation: mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM0 .if (\i == 5) || (\i == 6) || (\i == 7) -.irpc 
index, \i_seq - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, %xmm\index - movdqa SHUF_MASK(%rip), %xmm14 - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap -.endr -.irpc index, \i_seq - pxor 16*0(%arg1), %xmm\index -.endr -.irpc index, \i_seq - movaps 0x10(%rdi), \TMP1 - AESENC \TMP1, %xmm\index # Round 1 -.endr -.irpc index, \i_seq - movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr + MOVADQ ONE(%RIP),\TMP1 + MOVADQ 0(%arg1),\TMP2 .irpc index, \i_seq - movaps 0x30(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, %xmm\index + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pxor \TMP2, %xmm\index .endr -.irpc index, \i_seq - movaps 0x40(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x50(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x60(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x70(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x80(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 -.endr -.irpc index, \i_seq - movaps 0x90(%arg1), \TMP1 - AESENC \TMP1, %xmm\index # Round 2 + lea 0x10(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + +aes_loop_initial_enc\num_initial_blocks: + MOVADQ (%r10),\TMP1 +.irpc index, \i_seq + AESENC \TMP1, %xmm\index .endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_initial_enc\num_initial_blocks + + MOVADQ (%r10), \TMP1 .irpc index, \i_seq - movaps 0xa0(%arg1), \TMP1 - AESENCLAST \TMP1, %xmm\index # Round 10 + AESENCLAST \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg3 , %r11, 1), \TMP1 @@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu %xmm\index, (%arg2 , %r11, 1) # write back plaintext/ciphertext for num_initial_blocks add $16, %r11 - - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation @@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation: * Precomputations for HashKey parallel with encryption of first 4 blocks. 
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i */ - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM1 - movdqa SHUF_MASK(%rip), %xmm14 + MOVADQ ONE(%RIP),\TMP1 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM1 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM2 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM2 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM3 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM3 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - paddd ONE(%rip), \XMM0 # INCR Y0 - movdqa \XMM0, \XMM4 - movdqa SHUF_MASK(%rip), %xmm14 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM4 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap - pxor 16*0(%arg1), \XMM1 - pxor 16*0(%arg1), \XMM2 - pxor 16*0(%arg1), \XMM3 - pxor 16*0(%arg1), \XMM4 + MOVADQ 0(%arg1),\TMP1 + pxor \TMP1, \XMM1 + pxor \TMP1, \XMM2 + pxor \TMP1, \XMM3 + pxor \TMP1, \XMM4 movdqa \TMP3, \TMP5 pshufd $78, \TMP3, \TMP1 pxor \TMP3, \TMP1 @@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation: pshufd $78, \TMP5, \TMP1 pxor \TMP5, \TMP1 movdqa \TMP1, HashKey_4_k(%rsp) - movaps 0xa0(%arg1), \TMP2 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_pre_enc_done\num_initial_blocks + +aes_loop_pre_enc\num_initial_blocks: + MOVADQ (%r10),\TMP2 +.irpc index, 1234 + AESENC \TMP2, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_pre_enc\num_initial_blocks + +aes_loop_pre_enc_done\num_initial_blocks: + MOVADQ (%r10), \TMP2 AESENCLAST \TMP2, \XMM1 AESENCLAST \TMP2, \XMM2 AESENCLAST \TMP2, \XMM3 @@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu \XMM4, 16*3(%arg2 , %r11 , 1) add $64, %r11 - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - movdqa SHUF_MASK(%rip), %xmm14 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\num_initial_blocks\operation: @@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - movaps 0xa0(%arg1), \TMP3 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_par_enc_done + +aes_loop_par_enc: + MOVADQ (%r10),\TMP3 +.irpc index, 1234 + AESENC \TMP3, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_par_enc + +aes_loop_par_enc_done: + MOVADQ (%r10), \TMP3 AESENCLAST \TMP3, \XMM1 # Round 10 AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 @@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation AESENC \TMP3, \XMM3 AESENC \TMP3, \XMM4 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - movaps 0xa0(%arg1), \TMP3 - AESENCLAST \TMP3, \XMM1 # Round 10 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_par_dec_done + +aes_loop_par_dec: + MOVADQ (%r10),\TMP3 +.irpc index, 1234 + AESENC \TMP3, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_par_dec + 
+aes_loop_par_dec_done: + MOVADQ (%r10), \TMP3 + AESENCLAST \TMP3, \XMM1 # last round AESENCLAST \TMP3, \XMM2 AESENCLAST \TMP3, \XMM3 AESENCLAST \TMP3, \XMM4 @@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pxor \TMP6, \XMMDst # reduced result is in XMMDst .endm -/* Encryption of a single block done*/ -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - pxor (%arg1), \XMM0 - movaps 16(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 32(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 48(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 64(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 80(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 96(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 112(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 128(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 144(%arg1), \TMP1 - AESENC \TMP1, \XMM0 - movaps 160(%arg1), \TMP1 - AESENCLAST \TMP1, \XMM0 -.endm +/* Encryption of a single block +* uses eax & r10 +*/ +.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 + pxor (%arg1), \XMM0 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + lea 16(%arg1), %r10 # get first expanded key address + +_esb_loop_\@: + MOVADQ (%r10),\TMP1 + AESENC \TMP1,\XMM0 + add $16,%r10 + sub $1,%eax + jnz _esb_loop_\@ + + MOVADQ (%r10),\TMP1 + AESENCLAST \TMP1,\XMM0 +.endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. * u8 *out, // Plaintext output. Encrypt in-place is allowed. diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ae855f4..947c6bf 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -43,6 +43,7 @@ #include <asm/crypto/glue_helper.h> #endif + /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. 
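The key-length-driven loops added throughout the aesni-intel_asm.S changes above all derive their AESENC count from the key_length field of the AES context (the "keysize" operand): key_length is 16, 24 or 32 bytes, so shifting right by two and adding five yields 9, 11 or 13 rounds, with AESENCLAST supplying the final one. A minimal C sketch of that mapping, purely illustrative and not part of the patch (the helper name is invented here):

#include <stdio.h>

/*
 * Illustrative only. The assembly above reads the key_length member of
 * the AES context (16, 24 or 32 bytes, addressed via the "keysize"
 * macro) and computes how many AESENC rounds to run before the single
 * AESENCLAST.
 */
static int aes_middle_rounds(int key_length_bytes)
{
	/* 16 -> 4, 24 -> 6, 32 -> 8 ... */
	int quarters = key_length_bytes >> 2;
	/* ... then 9, 11 or 13 AESENC rounds; AESENCLAST is the extra one. */
	return quarters + 5;
}

int main(void)
{
	int sizes[] = { 16, 24, 32 };	/* AES-128, AES-192, AES-256 key sizes in bytes */
	int i;

	for (i = 0; i < 3; i++)
		printf("AES-%d: %d AESENC rounds + 1 AESENCLAST = %d total\n",
		       sizes[i] * 8, aes_middle_rounds(sizes[i]),
		       aes_middle_rounds(sizes[i]) + 1);
	return 0;
}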
@@ -182,7 +183,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (plaintext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){ aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { @@ -197,7 +199,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (ciphertext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else { @@ -231,7 +234,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (plaintext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (plaintext_len < AVX_GEN4_OPTSIZE) { @@ -250,7 +254,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len) { - if (ciphertext_len < AVX_GEN2_OPTSIZE) { + struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; + if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, aad_len, auth_tag, auth_tag_len); } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { @@ -511,7 +516,7 @@ static int ctr_crypt(struct blkcipher_desc *desc, kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, - nbytes & AES_BLOCK_MASK, walk.iv); + nbytes & AES_BLOCK_MASK, walk.iv); nbytes &= AES_BLOCK_SIZE - 1; err = blkcipher_walk_done(desc, &walk, nbytes); } @@ -902,7 +907,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, } /*Account for 4 byte nonce at the end.*/ key_len -= 4; - if (key_len != AES_KEYSIZE_128) { + if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) { crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } @@ -1013,6 +1019,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); u8 iv_tab[16+AESNI_ALIGN]; @@ -1027,6 +1034,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) /* to 8 or 12 bytes */ if (unlikely(req->assoclen != 8 && req->assoclen != 12)) return -EINVAL; + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) + return -EINVAL; + if (unlikely(key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256)) + return -EINVAL; + /* IV below built */ for (i = 0; i < 4; i++) *(iv+i) 
= ctx->nonce[i]; @@ -1091,6 +1105,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); u8 iv_and_authTag[32+AESNI_ALIGN]; @@ -1104,6 +1119,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (unlikely((req->cryptlen < auth_tag_len) || (req->assoclen != 8 && req->assoclen != 12))) return -EINVAL; + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) + return -EINVAL; + if (unlikely(key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256)) + return -EINVAL; + /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ /* equal to 8 or 12 bytes */ diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 38a14f8..d6fc59a 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -504,6 +504,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); MODULE_ALIAS_CRYPTO("des3_ede"); MODULE_ALIAS_CRYPTO("des3_ede-asm"); -MODULE_ALIAS_CRYPTO("des"); -MODULE_ALIAS_CRYPTO("des-asm"); MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>"); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9e181a..d0165c9 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, u32 tmp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { /* diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 92003f3..efc3b22 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -213,7 +213,15 @@ void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else extern int apic_force_enable(unsigned long addr); +#endif extern int apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h new file mode 100644 index 0000000..cd2ce40 --- /dev/null +++ b/arch/x86/include/asm/imr.h @@ -0,0 +1,60 @@ +/* + * imr.h: Isolated Memory Region API + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _IMR_H +#define _IMR_H + +#include <linux/types.h> + +/* + * IMR agent access mask bits + * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register + * definitions. 
+ */ +#define IMR_ESRAM_FLUSH BIT(31) +#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */ +#define IMR_RMU BIT(29) +#define IMR_VC1_SAI_ID3 BIT(15) +#define IMR_VC1_SAI_ID2 BIT(14) +#define IMR_VC1_SAI_ID1 BIT(13) +#define IMR_VC1_SAI_ID0 BIT(12) +#define IMR_VC0_SAI_ID3 BIT(11) +#define IMR_VC0_SAI_ID2 BIT(10) +#define IMR_VC0_SAI_ID1 BIT(9) +#define IMR_VC0_SAI_ID0 BIT(8) +#define IMR_CPU_0 BIT(1) /* SMM mode */ +#define IMR_CPU BIT(0) /* Non SMM mode */ +#define IMR_ACCESS_NONE 0 + +/* + * Read/Write access-all bits here include some reserved bits + * These are the values firmware uses and are accepted by hardware. + * The kernel defines read/write access-all in the same way as firmware + * in order to have a consistent and crisp definition across firmware, + * bootloader and kernel. + */ +#define IMR_READ_ACCESS_ALL 0xBFFFFFFF +#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF + +/* Number of IMRs provided by Quark X1000 SoC */ +#define QUARK_X1000_IMR_MAX 0x08 +#define QUARK_X1000_IMR_REGBASE 0x40 + +/* IMR alignment bits - only bits 31:10 are checked for IMR validity */ +#define IMR_ALIGN 0x400 +#define IMR_MASK (IMR_ALIGN - 1) + +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock); + +int imr_remove_range(phys_addr_t base, size_t size); + +#endif /* _IMR_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index e34e097..705d357 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern struct console early_mrst_console; -extern void mrst_early_console_init(void); - extern struct console early_hsu_console; extern void hsu_early_console_init(const char *); diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h new file mode 100644 index 0000000..8b22422 --- /dev/null +++ b/arch/x86/include/asm/kasan.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_KASAN_H +#define _ASM_X86_KASAN_H + +/* + * Compiler uses shadow offset assuming that addresses start + * from 0. 
Kernel addresses don't start from 0, so shadow + * for kernel really starts from compiler's shadow offset + + * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT + */ +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ + (0xffff800000000000ULL >> 3)) +/* 47 bits for kernel address -> (47 - 3) bits for shadow */ +#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3))) + +#ifndef __ASSEMBLY__ + +extern pte_t kasan_zero_pte[]; +extern pte_t kasan_zero_pmd[]; +extern pte_t kasan_zero_pud[]; + +#ifdef CONFIG_KASAN +void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_init(void); +#else +static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_init(void) { } +#endif + +#endif + +#endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index eb18117..57a9d94 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,7 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d89c6b8..a236e39 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,8 +38,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_MMIO_SIZE 16 - #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -51,7 +49,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL -#define CR3_PCID_INVD (1UL << 63) +#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -160,6 +158,18 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -615,6 +625,8 @@ struct kvm_arch { #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif + + bool boot_vcpu_runs_old_kvmclock; }; struct kvm_vm_stat { @@ -643,6 +655,7 @@ struct kvm_vcpu_stat { u32 irq_window_exits; u32 nmi_window_exits; u32 halt_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -787,6 +800,31 @@ struct kvm_x86_ops { int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* + * Arch-specific dirty logging hooks. These hooks are only supposed to + * be valid if the specific arch has hardware-accelerated dirty logging + * mechanism. Currently only for PML on VMX. + * + * - slot_enable_log_dirty: + * called when enabling log dirty mode for the slot. + * - slot_disable_log_dirty: + * called when disabling log dirty mode for the slot. + * also called when slot is created with log dirty disabled. + * - flush_log_dirty: + * called before reporting dirty_bitmap to userspace. 
+ * - enable_log_dirty_pt_masked: + * called when reenabling log dirty for the GFNs in the mask after + * corresponding bits are cleared in slot->dirty_bitmap. + */ + void (*slot_enable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*slot_disable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*flush_log_dirty)(struct kvm *kvm); + void (*enable_log_dirty_pt_masked)(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t offset, unsigned long mask); }; struct kvm_arch_async_pf { @@ -819,10 +857,17 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d..ef01fef 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 0000000..a455a53 --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,46 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include <linux/module.h> +#include <linux/ftrace.h> + +#ifdef CONFIG_LIVEPATCH +static inline int klp_check_compiler_support(void) +{ +#ifndef CC_USING_FENTRY + return 1; +#endif + return 0; +} +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} +#else +#error Live patching support is disabled; check CONFIG_LIVEPATCH +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 876e74e..09b9620 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,6 +19,8 @@ typedef struct { struct mutex lock; void __user *vdso; + + atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4b75d59..883f6b9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -18,6 +18,21 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, } #endif /* !CONFIG_PARAVIRT */ +#ifdef CONFIG_PERF_EVENTS +extern struct static_key rdpmc_always_available; + +static inline void load_mm_cr4(struct mm_struct *mm) +{ + if (static_key_true(&rdpmc_always_available) || + atomic_read(&mm->context.perf_rdpmc_allowed)) + cr4_set_bits(X86_CR4_PCE); + else + cr4_clear_bits(X86_CR4_PCE); +} +#else +static inline void load_mm_cr4(struct mm_struct *mm) {} +#endif + /* * Used for LDT copy/destruction. */ @@ -52,15 +67,20 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); + /* Load per-mm CR4 state */ + load_mm_cr4(next); + /* * Load the LDT, if the LDT is different. * - * It's possible leave_mm(prev) has been called. If so, - * then prev->context.ldt could be out of sync with the - * LDT descriptor or the LDT register. This can only happen - * if prev->context.ldt is non-null, since we never free - * an LDT. But LDTs can't be shared across mms, so - * prev->context.ldt won't be equal to next->context.ldt. + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never free an LDT while the mm still exists. That + * means that next->context.ldt != prev->context.ldt, + * because mms never share an LDT. 
*/ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); @@ -85,6 +105,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); load_LDT_nolock(&next->context); } } diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 75450b2..4edd53b 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,17 +1,23 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 2 +#ifdef CONFIG_KASAN +#define KASAN_STACK_ORDER 1 +#else +#define KASAN_STACK_ORDER 0 +#endif + +#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) -#define IRQ_STACK_ORDER 2 +#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) #define DOUBLEFAULT_STACK 1 diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index f97fbe3..95e11f7 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -51,6 +51,8 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern bool kaslr_enabled; + static inline phys_addr_t get_max_mapped(void) { return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 32444ae..965c47d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_cpu_ops.write_cr4, x); } diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 164e3f8..fa1195d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 206a87f..fd74a11 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range. 
- */ -#define PTE_FILE_MAX_BITS 29 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + - _PAGE_FILE, - }; -} - /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 81bb91b..cdaa58c 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -/* - * Bits 0, 6 and 7 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - * - * For soft-dirty tracking 11 bit is taken from - * the low part of pte as well. - */ -#define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) \ - ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) -#define PTE_FILE_MAX_BITS 32 - /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e8a5454..a0c35bf 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte) return pte_flags(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) -{ - return pte_flags(pte) & _PAGE_FILE; -} - static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; @@ -137,13 +132,7 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - /* - * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. - * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == - * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. 
- */ - return (pte_flags(pte) & _PAGE_SPECIAL) && - (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); + return pte_flags(pte) & _PAGE_SPECIAL; } static inline unsigned long pte_pfn(pte_t pte) @@ -305,7 +294,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -329,21 +318,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SOFT_DIRTY; -} - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -463,13 +437,6 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) -{ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -479,7 +446,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; @@ -499,10 +466,27 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. 
See the + * comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be @@ -559,11 +543,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -882,19 +861,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4572b2f..2ee7811 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* PUD - Level3 access */ /* PMD - Level 2 access */ -#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ - _PAGE_FILE }) -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. */ @@ -145,13 +141,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 25bcd4a8..8c7c108 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -4,7 +4,7 @@ #include <linux/const.h> #include <asm/page_types.h> -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ @@ -27,19 +27,9 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. 
- */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -/* - set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -78,21 +68,6 @@ #endif /* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - -/* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 6 and 7 are *not* involved @@ -114,7 +89,6 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -125,8 +99,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* * The cache modes defined here are used to translate between pure SW usage @@ -327,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_flags(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_NUMA_MASK; -} -#endif /* CONFIG_NUMA_BALANCING */ - #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a092a0c..ec1c935 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
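_PAGE_CHG_MASK, visible in the hunk above, is the set of bits pte_modify() must carry over when a VMA's protection changes; with _PAGE_NUMA gone it is just the PFN plus the accessed/dirty/soft-dirty style state. A rough stand-alone sketch of that masking follows (the RW/ACCESSED/DIRTY bit values match x86, but the PFN mask and the model_pte_modify name are simplified assumptions):

#include <assert.h>
#include <stdint.h>

typedef uint64_t pteval_t;

#define PAGE_PRESENT	(1ULL << 0)
#define PAGE_RW		(1ULL << 1)
#define PAGE_ACCESSED	(1ULL << 5)
#define PAGE_DIRTY	(1ULL << 6)
#define PFN_MASK	0x000ffffffffff000ULL	/* simplified PTE_PFN_MASK */

#define CHG_MASK	(PFN_MASK | PAGE_ACCESSED | PAGE_DIRTY)

/* keep the bits in CHG_MASK, take everything else from the new protection */
static pteval_t model_pte_modify(pteval_t pte, pteval_t newprot)
{
	return (pte & CHG_MASK) | (newprot & ~CHG_MASK);
}

int main(void)
{
	pteval_t pte = 0x1234000ULL | PAGE_PRESENT | PAGE_RW | PAGE_DIRTY;
	pteval_t out = model_pte_modify(pte, PAGE_PRESENT);	/* mprotect to read-only */

	assert(out & PAGE_DIRTY);			/* dirty state survives */
	assert((out & PFN_MASK) == 0x1234000ULL);	/* so does the target frame */
	assert(!(out & PAGE_RW));			/* write permission is dropped */
	return 0;
}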
- */ -extern unsigned long mmu_cr4_features; -extern u32 *trampoline_cr4_features; - -static inline void set_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features |= mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); -} - -static inline void clear_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features &= ~mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); -} - typedef struct { unsigned long seg; } mm_segment_t; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index e820c08..6a4b00f 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x) native_write_cr3(x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return native_read_cr4(); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return native_read_cr4_safe(); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 625660f..cf87de3 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key); static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) { - set_bit(0, (volatile unsigned long *)&lock->tickets.tail); + set_bit(0, (volatile unsigned long *)&lock->tickets.head); } #else /* !CONFIG_PARAVIRT_SPINLOCKS */ @@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ +static inline int __tickets_equal(__ticket_t one, __ticket_t two) +{ + return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); +} + +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, + __ticket_t head) +{ + if (head & TICKET_SLOWPATH_FLAG) { + arch_spinlock_t old, new; + + old.tickets.head = head; + new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; + old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; + new.tickets.tail = old.tickets.tail; + + /* try to clear slowpath flag when there are no contenders */ + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); + } +} static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { - return lock.tickets.head == lock.tickets.tail; + return __tickets_equal(lock.tickets.head, lock.tickets.tail); } /* @@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) if (likely(inc.head == inc.tail)) goto out; - inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { unsigned count = SPIN_THRESHOLD; do { - if (READ_ONCE(lock->tickets.head) == inc.tail) - goto out; + inc.head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(inc.head, inc.tail)) + goto clear_slowpath; cpu_relax(); } while (--count); __ticket_lock_spinning(lock, inc.tail); } -out: barrier(); /* make sure nothing creeps before the lock is taken */ +clear_slowpath: + __ticket_check_and_clear_slowpath(lock, inc.head); +out: + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) @@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) 
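The spinlock.h changes above move the paravirt slowpath flag into the head ticket and route every comparison through __tickets_equal(), which masks that flag off before comparing. The single-threaded sketch below only models the arithmetic (a 16-bit ticket, flag in bit 0, tickets advancing by 2); the real code does this with xadd/cmpxchg and READ_ONCE.

#include <assert.h>
#include <stdint.h>

typedef uint16_t ticket_t;		/* stand-in for __ticket_t */
#define SLOWPATH_FLAG	((ticket_t)1)	/* TICKET_SLOWPATH_FLAG, lowest ticket bit */
#define TICKET_INC	((ticket_t)2)	/* TICKET_LOCK_INC when the flag bit is in use */

/* __tickets_equal(): ignore the slowpath flag when comparing tickets */
static int tickets_equal(ticket_t a, ticket_t b)
{
	return !((a ^ b) & ~SLOWPATH_FLAG);
}

int main(void)
{
	/* holder owns ticket 4, two more tickets handed out, a waiter set the flag on head */
	ticket_t head = 4 | SLOWPATH_FLAG;
	ticket_t tail = 8;

	assert(!tickets_equal(head, tail));		/* still contended */
	assert(tickets_equal(head, (ticket_t)4));	/* the flag bit does not matter */

	/* unlock: xadd returns the old head; the flag in it means "kick a waiter" */
	ticket_t old_head = head;
	head = (ticket_t)(head + TICKET_INC);
	if (old_head & SLOWPATH_FLAG) {
		ticket_t next = (ticket_t)((old_head & ~SLOWPATH_FLAG) + TICKET_INC);
		assert(next == 6);	/* the ticket __ticket_unlock_kick() would wake */
	}
	return 0;
}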
arch_spinlock_t old, new; old.tickets = READ_ONCE(lock->tickets); - if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) + if (!__tickets_equal(old.tickets.head, old.tickets.tail)) return 0; new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); + new.head_tail &= ~TICKET_SLOWPATH_FLAG; /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, - arch_spinlock_t old) -{ - arch_spinlock_t new; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - /* Perform the unlock on the "before" copy */ - old.tickets.head += TICKET_LOCK_INC; - - /* Clear the slowpath flag */ - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); - - /* - * If the lock is uncontended, clear the flag - use cmpxchg in - * case it changes behind our back though. - */ - if (new.tickets.head != new.tickets.tail || - cmpxchg(&lock->head_tail, old.head_tail, - new.head_tail) != old.head_tail) { - /* - * Lock still has someone queued for it, so wake up an - * appropriate waiter. - */ - __ticket_unlock_kick(lock, old.tickets.head); - } -} - static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG && - static_key_false(¶virt_ticketlocks_enabled)) { - arch_spinlock_t prev; + static_key_false(¶virt_ticketlocks_enabled)) { + __ticket_t head; - prev = *lock; - add_smp(&lock->tickets.head, TICKET_LOCK_INC); + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - /* add_smp() is a full mb() */ + head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) - __ticket_unlock_slowpath(lock, prev); + if (unlikely(head & TICKET_SLOWPATH_FLAG)) { + head &= ~TICKET_SLOWPATH_FLAG; + __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); + } } else __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } @@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return tmp.tail != tmp.head; + return !__tickets_equal(tmp.tail, tmp.head); } static inline int arch_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; + tmp.head &= ~TICKET_SLOWPATH_FLAG; + return (tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended @@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - __ticket_t head = ACCESS_ONCE(lock->tickets.head); + __ticket_t head = READ_ONCE(lock->tickets.head); for (;;) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); /* * We need to check "unlocked" in a loop, tmp.head == head * can be false positive because of overflow. */ - if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) || - tmp.head != head) + if (__tickets_equal(tmp.head, tmp.tail) || + !__tickets_equal(tmp.head, head)) break; cpu_relax(); diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 19e2c46..e466119 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. 
*/ #define __HAVE_ARCH_MEMCPY 1 +extern void *__memcpy(void *to, const void *from, size_t len); + #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 extern void *memcpy(void *to, const void *from, size_t len); #else -extern void *__memcpy(void *to, const void *from, size_t len); #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ @@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); +void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); +void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); @@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) + +/* + * For files that not instrumented (e.g. mm/slub.c) we + * should use not instrumented version of mem* functions. + */ + +#undef memcpy +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e82e95a..1d4e4f2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +44,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 04905bfc..cd79194 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,6 +15,75 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +struct tlb_state { +#ifdef CONFIG_SMP + struct mm_struct *active_mm; + int state; +#endif + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by + * disabling interrupts when modifying either one. + */ + unsigned long cr4; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +/* Initialize cr4 shadow for this CPU. */ +static inline void cr4_init_shadow(void) +{ + this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); +} + +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 | mask) != cr4) { + cr4 |= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } +} + +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 & ~mask) != cr4) { + cr4 &= ~mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } +} + +/* Read the CR4 shadow. */ +static inline unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} + +/* + * Save some of cr4 feature set we're using (e.g. 
Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. This should only be used + * during boot on the boot cpu. + */ +extern unsigned long mmu_cr4_features; +extern u32 *trampoline_cr4_features; + +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + static inline void __native_flush_tlb(void) { native_write_cr3(native_read_cr3()); @@ -24,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void) { unsigned long cr4; - cr4 = native_read_cr4(); + cr4 = this_cpu_read(cpu_tlbstate.cr4); /* clear PGE */ native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ @@ -184,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -struct tlb_state { - struct mm_struct *active_mm; - int state; -}; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); - static inline void reset_lazy_tlbstate(void) { this_cpu_write(cpu_tlbstate.state, 0); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 0d592e0..ace9dec 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) asm volatile("call __get_user_%P3" \ : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ - (x) = (__typeof__(*(ptr))) __val_gu; \ + (x) = (__force __typeof__(*(ptr))) __val_gu; \ __ret_gu; \ }) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 5da71c2..cce9ee6 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -19,6 +19,7 @@ #include <asm/vmx.h> #include <asm/svm.h> +#include <asm/tlbflush.h> /* * VMX functions: @@ -40,12 +41,12 @@ static inline int cpu_has_vmx(void) static inline void cpu_vmxoff(void) { asm volatile (ASM_VMX_VMXOFF : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static inline int cpu_vmx_enabled(void) { - return read_cr4() & X86_CR4_VMXE; + return __read_cr4() & X86_CR4_VMXE; } /** Disable VMX if it is enabled on the current CPU diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 45afaee..da772ed 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -121,6 +122,7 @@ enum vmcs_field { GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, GUEST_INTR_STATUS = 0x00000810, + GUEST_PML_INDEX = 0x00000812, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, @@ -140,6 +142,8 @@ enum vmcs_field { VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + PML_ADDRESS = 0x0000200e, + PML_ADDRESS_HIGH = 0x0000200f, TSC_OFFSET = 0x00002010, TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 5eea099..358dcd3 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -55,9 +55,8 @@ extern int 
set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count); -extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); /* * Helper functions to write or read unsigned long values to/from @@ -154,21 +153,12 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) return mfn; pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) != mfn) { - /* - * If this appears to be a foreign mfn (because the pfn - * doesn't map back to the mfn), then check the local override - * table to see if there's a better pfn to use. - * - * m2p_find_override_pfn returns ~0 if it doesn't find anything. - */ - pfn = m2p_find_override_pfn(mfn, ~0); - } + if (__pfn_to_mfn(pfn) != mfn) + pfn = ~0; /* - * pfn is ~0 if there are no entries in the m2p for mfn or if the - * entry doesn't map back to the mfn and m2p_override doesn't have a - * valid entry for it. + * pfn is ~0 if there are no entries in the m2p for mfn or the + * entry doesn't map back to the mfn. */ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn)) pfn = mfn; diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b098..44e6dd7 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,6 +7,7 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 +#define SETUP_KASLR 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 462efe7..90c458e 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -187,6 +187,17 @@ #define HV_X64_MSR_SINT14 0x4000009E #define HV_X64_MSR_SINT15 0x4000009F +/* + * Synthetic Timer MSRs. Four timers per vcpu. 
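With m2p_find_override_pfn() gone, the mfn_to_pfn() path above simply reports a foreign machine frame as unmapped (~0) when the reverse lookup does not map back. The toy tables below exist only to make that check concrete; the real lookups go through machine_to_phys_mapping and the p2m tree, not flat arrays.

#include <assert.h>

#define NPAGES	4
#define INVALID	(~0UL)

/* toy translations: pfn i is backed by machine frame 10 + i */
static unsigned long p2m[NPAGES] = { 10, 11, 12, 13 };
static unsigned long m2p[NPAGES] = { 0, 1, 2, 99 };	/* mfn 13 points at a bogus pfn */

static unsigned long model_mfn_to_pfn(unsigned long mfn)
{
	unsigned long pfn = m2p[mfn - 10];

	/* no override table any more: not mapping back means "foreign", full stop */
	if (pfn >= NPAGES || p2m[pfn] != mfn)
		return INVALID;
	return pfn;
}

int main(void)
{
	assert(model_mfn_to_pfn(11) == 1);		/* a local page */
	assert(model_mfn_to_pfn(13) == INVALID);	/* a foreign/granted page */
	return 0;
}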
+ */ +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index d979e5a..3ce0791 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -152,6 +152,10 @@ #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 #define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 +#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 +#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 +#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f @@ -360,8 +364,12 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b +#define MSR_IA32_SMBASE 0x0000009e + #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 +#define INTEL_PERF_CTL_MASK 0xffff #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD_PERF_STATUS 0xc0010063 #define MSR_AMD_PERF_CTL 0xc0010062 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index b813bf9..c5f1a1d 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -56,6 +56,7 @@ #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 @@ -72,6 +73,7 @@ #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -116,10 +118,14 @@ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } +#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 + #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c..cdb1b70 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif +KASAN_SANITIZE_head$(BITS).o := n +KASAN_SANITIZE_dumpstack.o := n +KASAN_SANITIZE_dumpstack_$(BITS).o := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o entry_$(BITS).o @@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a18fff3..3d525c6 100644 --- 
a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -613,6 +613,11 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { int rc, irq, trigger, polarity; + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + *irqp = gsi; + return 0; + } + rc = acpi_get_override_irq(gsi, &trigger, &polarity); if (rc == 0) { trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; @@ -845,13 +850,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) static int __init acpi_parse_sbf(struct acpi_table_header *table) { - struct acpi_table_boot *sb; - - sb = (struct acpi_table_boot *)table; - if (!sb) { - printk(KERN_WARNING PREFIX "Unable to map SBF\n"); - return -ENODEV; - } + struct acpi_table_boot *sb = (struct acpi_table_boot *)table; sbf_port = sb->cmos_index; /* Save CMOS port */ @@ -865,13 +864,7 @@ static struct resource *hpet_res __initdata; static int __init acpi_parse_hpet(struct acpi_table_header *table) { - struct acpi_table_hpet *hpet_tbl; - - hpet_tbl = (struct acpi_table_hpet *)table; - if (!hpet_tbl) { - printk(KERN_WARNING PREFIX "Unable to map HPET\n"); - return -ENODEV; - } + struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { printk(KERN_WARNING PREFIX "HPET timers must be located in " diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3136820..d1daead 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void) header->pmode_cr0 = read_cr0(); if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = __read_cr4(); header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b665d24..ad3639a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1580,8 +1580,7 @@ static __init void try_to_enable_x2apic(int remap_mode) * under KVM */ if (max_physical_apicid > 255 || - (IS_ENABLED(CONFIG_HYPERVISOR_GUEST) && - !hypervisor_x2apic_available())) { + !hypervisor_x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b15bffc..b5c8ff5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/debugreg.h> #include <asm/sections.h> #include <asm/vsyscall.h> @@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep); static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) - set_in_cr4(X86_CR4_SMEP); + cr4_set_bits(X86_CR4_SMEP); } static __init int setup_disable_smap(char *arg) @@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_SMAP)) { #ifdef CONFIG_X86_SMAP - set_in_cr4(X86_CR4_SMAP); + cr4_set_bits(X86_CR4_SMAP); #else - clear_in_cr4(X86_CR4_SMAP); + cr4_clear_bits(X86_CR4_SMAP); #endif } } @@ -1295,6 +1296,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + + /* * Load microcode on this cpu if a valid microcode is available. * This is early microcode loading procedure. 
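setup_smep(), setup_smap() and cpu_init() above now go through the per-CPU CR4 shadow introduced in the tlbflush.h hunk earlier in this diff: the cached value is consulted first, so the expensive control-register write only happens when a bit actually flips. A single-CPU sketch of that pattern (percpu storage, interrupt disabling and the real read/write accessors are all stubbed out):

#include <assert.h>

static unsigned long hw_cr4;		/* stand-in for the real %cr4 */
static unsigned long cr4_shadow;	/* stand-in for cpu_tlbstate.cr4 */
static unsigned long cr4_writes;

static void write_cr4_hw(unsigned long v) { hw_cr4 = v; cr4_writes++; }

static void model_cr4_init_shadow(void) { cr4_shadow = hw_cr4; }

static void model_cr4_set_bits(unsigned long mask)
{
	unsigned long cr4 = cr4_shadow;

	if ((cr4 | mask) != cr4) {	/* skip the write if nothing changes */
		cr4 |= mask;
		cr4_shadow = cr4;
		write_cr4_hw(cr4);
	}
}

static void model_cr4_clear_bits(unsigned long mask)
{
	unsigned long cr4 = cr4_shadow;

	if ((cr4 & ~mask) != cr4) {
		cr4 &= ~mask;
		cr4_shadow = cr4;
		write_cr4_hw(cr4);
	}
}

int main(void)
{
	hw_cr4 = 1UL << 5;		/* pretend PAE is already enabled */
	model_cr4_init_shadow();

	model_cr4_set_bits(1UL << 20);	/* SMEP */
	model_cr4_set_bits(1UL << 20);	/* second call: shadow says no-op */
	model_cr4_clear_bits(1UL << 2);	/* TSD already clear: no write either */

	assert(cr4_writes == 1);
	return 0;
}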
*/ @@ -1313,7 +1320,7 @@ void cpu_init(void) pr_debug("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, @@ -1394,7 +1401,7 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); switch_to_new_gdt(cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c703507..6596433 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, int type, char *buf) { - ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; - int n = 0; - - if (len > 1) { - const struct cpumask *mask; - - mask = to_cpumask(this_leaf->shared_cpu_map); - n = type ? - cpulist_scnprintf(buf, len-2, mask) : - cpumask_scnprintf(buf, len-2, mask); - buf[n++] = '\n'; - buf[n] = '\0'; - } - return n; + const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); + int ret; + + if (type) + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", + cpumask_pr_args(mask)); + else + ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", + cpumask_pr_args(mask)); + buf[ret++] = '\n'; + buf[ret] = '\0'; + return ret; } static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cdfed79..3c036cb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -44,6 +44,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -151,14 +152,11 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; - int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); - ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); - if (ret == NOTIFY_STOP) - return; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); mce->finished = 0; wmb(); @@ -1452,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void) bitmap_fill(all_banks, MAX_NR_BANKS); machine_check_poll(MCP_UC | m_fl, &all_banks); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index ec2663a..737b0ad 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/traps.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -65,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) "Intel old style machine check architecture supported.\n"); /* Enable MCE: */ - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index bd5d46a..44f1382 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/traps.h> 
+#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -36,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c6826d1..746e7fd 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, struct microcode_header_intel mc_header; unsigned int mc_size; + if (leftover < sizeof(mc_header)) { + pr_err("error! Truncated header in microcode data file\n"); + break; + } + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) break; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index ec9df6f..420eb93 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -321,7 +321,11 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover) { + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9e451b0..f8c81ba 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -138,8 +138,8 @@ static void prepare_set(void) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* @@ -171,7 +171,7 @@ static void post_set(void) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); } static void cyrix_set_arr(unsigned int reg, unsigned long base, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0e25a1b..7d74f7b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ @@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 143e5f5..b71a7f8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,8 @@ #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> @@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; + u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -1327,8 +1331,6 @@ x86_pmu_notifier(struct 
notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - if (x86_pmu.attr_rdpmc) - set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1804,14 +1806,44 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } + if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + return err; } +static void refresh_pce(void *ignored) +{ + if (current->mm) + load_mm_cr4(current->mm); +} + +static void x86_pmu_event_mapped(struct perf_event *event) +{ + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static void x86_pmu_event_unmapped(struct perf_event *event) +{ + if (!current->mm) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; - if (!x86_pmu.attr_rdpmc) + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { @@ -1829,16 +1861,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev, return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } -static void change_rdpmc(void *info) -{ - bool enable = !!(unsigned long)info; - - if (enable) - set_in_cr4(X86_CR4_PCE); - else - clear_in_cr4(X86_CR4_PCE); -} - static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) @@ -1850,14 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev, if (ret) return ret; + if (val > 2) + return -EINVAL; + if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; - if (!!val != !!x86_pmu.attr_rdpmc) { - x86_pmu.attr_rdpmc = !!val; - on_each_cpu(change_rdpmc, (void *)val, 1); + if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { + /* + * Changing into or out of always available, aka + * perf-event-bypassing mode. This path is extremely slow, + * but only root can trigger it, so it's okay. 
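The mapped/unmapped hooks above arrange for CR4.PCE to track whether the incoming mm has any perf counter pages mapped, or whether the sysfs rdpmc file was set to 2. load_mm_cr4() itself is defined elsewhere in the series; the sketch below only models the decision it has to make, with toy types standing in for mm_struct and the static key.

#include <assert.h>

#define CR4_PCE	(1UL << 8)	/* CR4.PCE: permit RDPMC from user mode */

struct toy_mm {
	int perf_rdpmc_allowed;	/* bumped by x86_pmu_event_mapped() above */
};

static int rdpmc_always_available;	/* models the static key (rdpmc == 2) */

static unsigned long model_cr4_for(const struct toy_mm *mm, unsigned long cr4)
{
	if (rdpmc_always_available || (mm && mm->perf_rdpmc_allowed))
		return cr4 | CR4_PCE;
	return cr4 & ~CR4_PCE;
}

int main(void)
{
	struct toy_mm mapped   = { .perf_rdpmc_allowed = 1 };
	struct toy_mm unmapped = { .perf_rdpmc_allowed = 0 };

	assert(model_cr4_for(&mapped, 0) & CR4_PCE);
	assert(!(model_cr4_for(&unmapped, 0) & CR4_PCE));

	rdpmc_always_available = 1;	/* echo 2 > .../rdpmc */
	assert(model_cr4_for(&unmapped, 0) & CR4_PCE);
	return 0;
}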
+ */ + if (val == 2) + static_key_slow_inc(&rdpmc_always_available); + else + static_key_slow_dec(&rdpmc_always_available); + on_each_cpu(refresh_pce, NULL, 1); } + x86_pmu.attr_rdpmc = val; + return count; } @@ -1900,6 +1935,9 @@ static struct pmu pmu = { .event_init = x86_pmu_event_init, + .event_mapped = x86_pmu_event_mapped, + .event_unmapped = x86_pmu_event_unmapped, + .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, @@ -1914,13 +1952,15 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data *data; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_rdpmc = + !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); userpg->pmc_width = x86_pmu.cntval_bits; if (!sched_clock_stable()) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 4e6cdb0..df525d2 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -71,6 +71,8 @@ struct event_constraint { #define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ + struct amd_nb { int nb_id; /* NorthBridge id */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b74ebc7..cf3df1d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err) printk("SMP "); #endif #ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); + printk("DEBUG_PAGEALLOC "); +#endif +#ifdef CONFIG_KASAN + printk("KASAN"); #endif printk("\n"); if (notify_die(DIE_OOPS, str, regs, err, diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 01d1c18..a62536a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -19,6 +19,7 @@ #include <linux/usb/ehci_def.h> #include <linux/efi.h> #include <asm/efi.h> +#include <asm/pci_x86.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -76,7 +77,7 @@ static struct console early_vga_console = { /* Serial functions loosely based on a similar package from Klaus P. 
Gerlicher */ -static int early_serial_base = 0x3f8; /* ttyS0 */ +static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define XMTRDY 0x20 @@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + uint32_t *vaddr = (uint32_t *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + +static unsigned int io_serial_in(unsigned long addr, int offset) +{ + return inb(addr + offset); +} + +static void io_serial_out(unsigned long addr, int offset, int value) +{ + outb(value, addr + offset); +} + +static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; +static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; + static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - outb(ch, early_serial_base + TXR); + serial_out(early_serial_base, TXR, ch); return timeout ? 0 : -1; } @@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n) } } +static __init void early_serial_hw_init(unsigned divisor) +{ + unsigned char c; + + serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ + serial_out(early_serial_base, IER, 0); /* no interrupt */ + serial_out(early_serial_base, FCR, 0); /* no fifo */ + serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + + c = serial_in(early_serial_base, LCR); + serial_out(early_serial_base, LCR, c | DLAB); + serial_out(early_serial_base, DLL, divisor & 0xff); + serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); + serial_out(early_serial_base, LCR, c & ~DLAB); +} + #define DEFAULT_BAUD 9600 static __init void early_serial_init(char *s) { - unsigned char c; unsigned divisor; - unsigned baud = DEFAULT_BAUD; + unsigned long baud = DEFAULT_BAUD; char *e; if (*s == ',') @@ -145,24 +188,124 @@ static __init void early_serial_init(char *s) s++; } - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + if (*s) { + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = DEFAULT_BAUD; + } + + /* Convert from baud to divisor value */ + divisor = 115200 / baud; + + /* These will always be IO based ports */ + serial_in = io_serial_in; + serial_out = io_serial_out; + + /* Set up the HW */ + early_serial_hw_init(divisor); +} + +#ifdef CONFIG_PCI +/* + * early_pci_serial_init() + * + * This function is invoked when the early_printk param starts with "pciserial" + * The rest of the param should be ",B:D.F,baud" where B, D & F describe the + * location of a PCI device that must be a UART device. 
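The rework above funnels all register access through the serial_in/serial_out function pointers, so the same putc path can drive either a port-I/O UART or a memory-mapped one, and the divisor is computed as 115200/baud. A user-space sketch of that indirection (stub accessors instead of inb/outb or readl/writel; register offsets are the standard 16550 layout the header defines):

#include <stdio.h>

typedef unsigned int (*serial_in_fn)(unsigned long base, int offset);
typedef void (*serial_out_fn)(unsigned long base, int offset, int value);

/* stub "hardware": always reports the transmitter ready (LSR bit 0x20, XMTRDY) */
static unsigned int stub_in(unsigned long base, int offset)
{
	(void)base; (void)offset;
	return 0x20;
}

static void stub_out(unsigned long base, int offset, int value)
{
	printf("out(%#lx + %d) = %#x\n", base, offset, value);
}

/* swapped to io_serial_* or mem32_serial_* at init time in the real code */
static serial_in_fn  serial_in  = stub_in;
static serial_out_fn serial_out = stub_out;

static int early_putc(unsigned long base, unsigned char ch)
{
	unsigned timeout = 0xffff;

	while ((serial_in(base, 5) & 0x20) == 0 && --timeout)	/* poll LSR.THRE */
		;
	serial_out(base, 0, ch);				/* TXR */
	return timeout ? 0 : -1;
}

int main(void)
{
	unsigned long base = 0x3f8;		/* ttyS0 */
	unsigned baud = 115200;
	unsigned divisor = 115200 / baud;	/* programmed into DLL/DLH */

	serial_out(base, 3, 0x3);		/* LCR: 8n1, as early_serial_hw_init() does */
	serial_out(base, 0, divisor & 0xff);	/* DLL (DLAB is toggled in the real sequence) */
	serial_out(base, 1, (divisor >> 8) & 0xff);	/* DLH */
	return early_putc(base, 'x');
}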
+ */ +static __init void early_pci_serial_init(char *s) +{ + unsigned divisor; + unsigned long baud = DEFAULT_BAUD; + u8 bus, slot, func; + uint32_t classcode, bar0; + uint16_t cmdreg; + char *e; + + + /* + * First, part the param to get the BDF values + */ + if (*s == ',') + ++s; + + if (*s == 0) + return; + + bus = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != ':') + return; + ++s; + slot = (u8)simple_strtoul(s, &e, 16); + s = e; + if (*s != '.') + return; + ++s; + func = (u8)simple_strtoul(s, &e, 16); + s = e; + + /* A baud might be following */ + if (*s == ',') + s++; + + /* + * Second, find the device from the BDF + */ + cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND); + classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + + /* + * Verify it is a UART type device + */ + if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && + (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || + (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ + return; + + /* + * Determine if it is IO or memory mapped + */ + if (bar0 & 0x01) { + /* it is IO mapped */ + serial_in = io_serial_in; + serial_out = io_serial_out; + early_serial_base = bar0&0xfffffffc; + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_IO); + } else { + /* It is memory mapped - assume 32-bit alignment */ + serial_in = mem32_serial_in; + serial_out = mem32_serial_out; + /* WARNING! assuming the address is always in the first 4G */ + early_serial_base = + (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + write_pci_config(bus, slot, func, PCI_COMMAND, + cmdreg|PCI_COMMAND_MEMORY); + } + /* + * Lastly, initalize the hardware + */ if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) + if (strcmp(s, "nocfg") == 0) + /* Sometimes, we want to leave the UART alone + * and assume the BIOS has set it up correctly. + * "nocfg" tells us this is the case, and we + * should do no more setup. 
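early_pci_serial_init() above has three jobs: parse the B:D.F triple from the parameter, check that the device really is a 16550-style serial-class device, and pick port I/O vs MMIO from bit 0 of BAR0. The sketch below covers the first and last steps with standard C (strtoul instead of simple_strtoul, printf instead of config-space writes); the example BDF string and BAR value are made up.

#include <stdio.h>
#include <stdlib.h>

/* parse the "B:D.F" part of early_printk=pciserial,B:D.F,baud (hex fields) */
static int parse_bdf(const char *s, unsigned *bus, unsigned *slot, unsigned *func)
{
	char *e;

	*bus = strtoul(s, &e, 16);
	if (*e != ':')
		return -1;
	*slot = strtoul(e + 1, &e, 16);
	if (*e != '.')
		return -1;
	*func = strtoul(e + 1, &e, 16);
	return 0;
}

int main(void)
{
	unsigned bus, slot, func;
	unsigned long bar0 = 0xd001;	/* example BAR0; bit 0 set means I/O space */

	if (parse_bdf("0:15.2", &bus, &slot, &func))
		return 1;
	printf("UART candidate at %02x:%02x.%x\n", bus, slot, func);

	if (bar0 & 0x1)			/* same test as the hunk */
		printf("I/O ports at %#lx\n", bar0 & ~0x3UL);
	else
		printf("MMIO at %#lx (early_ioremap'ed in the real code)\n", bar0 & ~0xfUL);
	return 0;
}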
+ */ + return; + if (kstrtoul(s, 0, &baud) < 0 || baud == 0) baud = DEFAULT_BAUD; } + /* Convert from baud to divisor value */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + + /* Set up the HW */ + early_serial_hw_init(divisor); } +#endif static struct console early_serial_console = { .name = "earlyser", @@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf) early_serial_init(buf + 4); early_console_register(&early_serial_console, keep); } +#ifdef CONFIG_PCI + if (!strncmp(buf, "pciserial", 9)) { + early_pci_serial_init(buf + 9); + early_console_register(&early_serial_console, keep); + buf += 9; /* Keep from match the above "serial" */ + } +#endif if (!strncmp(buf, "vga", 3) && boot_params.screen_info.orig_video_isVGA == 1) { max_xpos = boot_params.screen_info.orig_video_cols; @@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf) early_console_register(&xenboot_console, keep); #endif #ifdef CONFIG_EARLY_PRINTK_INTEL_MID - if (!strncmp(buf, "mrst", 4)) { - mrst_early_console_init(); - early_console_register(&early_mrst_console, keep); - } - if (!strncmp(buf, "hsu", 3)) { hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6c1b983..2911ef3 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { + cr4_init_shadow(); sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index eda1a86..c4f8d46 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,6 +27,7 @@ #include <asm/bios_ebda.h> #include <asm/bootparam_utils.h> #include <asm/microcode.h> +#include <asm/kasan.h> /* * Manage page tables very early on. @@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void) next_early_pgt = 0; - write_cr3(__pa(early_level4_pgt)); + write_cr3(__pa_nodebug(early_level4_pgt)); } /* Create a new PMD entry */ @@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address) pmdval_t pmd, *pmd_p; /* Invalid address or early pgt is done ? 
*/ - if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) return -1; again: @@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + cr4_init_shadow(); + /* Kill off the identity-map trampoline */ reset_early_page_tables(); + kasan_map_early_shadow(early_level4_pgt); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); @@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; + kasan_map_early_shadow(init_level4_pgt); + x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a468c0a..6fd514d9 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -514,8 +514,38 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#ifdef CONFIG_KASAN +#define FILL(VAL, COUNT) \ + .rept (COUNT) ; \ + .quad (VAL) ; \ + .endr + +NEXT_PAGE(kasan_zero_pte) + FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pmd) + FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) +NEXT_PAGE(kasan_zero_pud) + FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) + +#undef FILL +#endif + + #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE + +#ifdef CONFIG_KASAN +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). 
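The kasan_zero_{pte,pmd,pud} tables built with FILL() above point every one of their 512 slots at the same lower level, so the whole early shadow region is backed by a single zero page and three 4 KiB tables. The C sketch below reproduces that construction in user space; the page-table permission bits and the actual mapping into the shadow region are omitted.

#include <stdint.h>
#include <stdio.h>

#define PTRS_PER_TABLE	512

static uint8_t  zero_page[4096];		/* kasan_zero_page */
static uint64_t zero_pte[PTRS_PER_TABLE];	/* kasan_zero_pte */
static uint64_t zero_pmd[PTRS_PER_TABLE];	/* kasan_zero_pmd */
static uint64_t zero_pud[PTRS_PER_TABLE];	/* kasan_zero_pud */

/* equivalent of FILL(): every entry points at the same next-level table/page
 * (real entries would also carry the _KERNPG_TABLE permission bits) */
static void fill(uint64_t *table, const void *target)
{
	for (int i = 0; i < PTRS_PER_TABLE; i++)
		table[i] = (uint64_t)(uintptr_t)target;
}

int main(void)
{
	fill(zero_pte, zero_page);
	fill(zero_pmd, zero_pte);
	fill(zero_pud, zero_pmd);

	/* 512^3 pages of shadow address space, backed by ~16 KiB of real memory */
	printf("%llu pages covered\n", 512ULL * 512 * 512);
	return 0;
}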
+ */ +NEXT_PAGE(kasan_zero_page) + .skip PAGE_SIZE +#endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 319bcb9..3acbff4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line) #define hpet_print_config() \ do { \ if (hpet_verbose) \ - _hpet_print_config(__FUNCTION__, __LINE__); \ + _hpet_print_config(__func__, __LINE__); \ } while (0) /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 81049ff..d5651fc 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -13,6 +13,7 @@ #include <asm/sigcontext.h> #include <asm/processor.h> #include <asm/math_emu.h> +#include <asm/tlbflush.h> #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/i387.h> @@ -193,7 +194,7 @@ void fpu_init(void) if (cpu_has_xmm) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) - set_in_cr4(cr4_mask); + cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 705ef8d..67b1cbe 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void) irq = __this_cpu_read(vector_irq[vector]); if (irq >= 0) { desc = irq_to_desc(irq); + if (!desc) + continue; + data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpu_clear(this_cpu, affinity_new); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 98f654d..6a1146e 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ - W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bb..0dd8d08 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. 
*/ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 94f6434..e354cc6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -609,7 +609,7 @@ static inline void check_zero(void) u8 ret; u8 old; - old = ACCESS_ONCE(zero_stats); + old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) int cpu; u64 start; unsigned long flags; + __ticket_t head; if (in_nmi()) return; @@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking. */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) add_stats(RELEASED_SLOW, 1); for_each_cpu(cpu, &waiting_cpus) { const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == ticket) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == ticket) { add_stats(RELEASED_SLOW_KICKED, 1); kvm_kick_cpu(cpu); break; diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 0000000..ff3c3101d --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/page_types.h> +#include <asm/elf.h> +#include <asm/livepatch.h> + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. 
+ */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e69f988..9bbb9b3 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> +#include <linux/kasan.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/gfp.h> @@ -46,21 +47,13 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; -static int randomize_modules = 1; /* Mutex protects the module_load_offset. */ static DEFINE_MUTEX(module_kaslr_mutex); -static int __init parse_nokaslr(char *p) -{ - randomize_modules = 0; - return 0; -} -early_param("nokaslr", parse_nokaslr); - static unsigned long int get_module_load_offset(void) { - if (randomize_modules) { + if (kaslr_enabled) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this @@ -83,13 +76,22 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + void *p; + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - return __vmalloc_node_range(size, 1, + + p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC, NUMA_NO_NODE, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { + vfree(p); + return NULL; + } + + return p; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127dda..046e2d6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ #include <asm/fpu-internal.h> #include <asm/debugreg.h> #include <asm/nmi.h> +#include <asm/tlbflush.h> /* * per-CPU TSS segments. 
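klp_write_module_reloc() above boils down to three steps: compute the relocation value for the handful of supported x86-64 types, temporarily make the page(s) writable if the location falls in the module's read-only region, and write it with probe_kernel_write(). The sketch below isolates just the value computation as a user-space function; the MY_R_* enum uses the standard x86-64 ELF type numbers and the rest of the handling is left out.

#include <assert.h>
#include <stdint.h>

/* standard x86-64 ELF relocation numbers for the cases handled above */
enum {
	MY_R_NONE = 0, MY_R_64 = 1, MY_R_PC32 = 2, MY_R_32 = 10, MY_R_32S = 11,
};

static int reloc_value(unsigned long type, unsigned long loc,
		       unsigned long value, uint64_t *val, int *size)
{
	*size = 4;
	switch (type) {
	case MY_R_NONE:	*val = 0; *size = 0;			break;
	case MY_R_64:	*val = value; *size = 8;		break;
	case MY_R_32:	*val = (uint32_t)value;			break;
	case MY_R_32S:	*val = (uint64_t)(int32_t)value;	break;
	case MY_R_PC32:	*val = (uint32_t)(value - loc);		break;
	default:	return -1;	/* unsupported, as in the function above */
	}
	return 0;
}

int main(void)
{
	uint64_t v;
	int sz;

	/* PC-relative: a target 0x1000 bytes below the patch site becomes -0x1000 */
	assert(reloc_value(MY_R_PC32, 0x401000, 0x400000, &v, &sz) == 0);
	assert(sz == 4 && (uint32_t)v == 0xfffff000u);

	assert(reloc_value(99, 0, 0, &v, &sz) == -1);
	return 0;
}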
Threads are completely 'soft' on Linux, @@ -141,7 +142,7 @@ void flush_thread(void) static void hard_disable_TSC(void) { - write_cr4(read_cr4() | X86_CR4_TSD); + cr4_set_bits(X86_CR4_TSD); } void disable_TSC(void) @@ -158,7 +159,7 @@ void disable_TSC(void) static void hard_enable_TSC(void) { - write_cr4(read_cr4() & ~X86_CR4_TSD); + cr4_clear_bits(X86_CR4_TSD); } static void enable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8f3ebfe..603c4f9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4_safe(); + cr4 = __read_cr4_safe(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a2c029..67fcc43 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index fe3dbfe..cd96852 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now) retval = set_rtc_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", - __FUNCTION__, retval); + __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %lx to RTC failed\n", - __FUNCTION__, nowtime); + __func__, nowtime); retval = -EINVAL; } return retval; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4648ada..98dc931 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -89,6 +89,7 @@ #include <asm/cacheflush.h> #include <asm/processor.h> #include <asm/bugs.h> +#include <asm/kasan.h> #include <asm/vsyscall.h> #include <asm/cpu.h> @@ -121,6 +122,8 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +bool __read_mostly kaslr_enabled = false; + #ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); #endif @@ -424,6 +427,11 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init parse_kaslr_setup(u64 pa_data, u32 data_len) +{ + kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data)); +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -449,6 +457,9 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_KASLR: + parse_kaslr_setup(pa_data, data_len); + break; default: break; } @@ -831,10 +842,14 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - pr_emerg("Kernel Offset: 0x%lx from 0x%lx " - "(relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, __START_KERNEL, - __START_KERNEL_map, MODULES_VADDR-1); + if (kaslr_enabled) + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + else + pr_emerg("Kernel Offset: disabled\n"); return 0; } @@ -1174,9 +1189,11 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_init(); + kasan_init(); + if 
(boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = read_cr4(); + mmu_cr4_features = __read_cr4(); if (trampoline_cr4_features) *trampoline_cr4_features = mmu_cr4_features; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2a33c8f..e504246 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8b96a94..81f8adb0 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -66,27 +66,54 @@ * Good-instruction tables for 32-bit apps. This is non-const and volatile * to keep gcc from statically optimizing it out, as variable_test_bit makes * some versions of gcc to think only *(unsigned long*) is used. + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs + * (why we support bound (62) then? it's similar, and similarly unused...) + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * 07,17,1f - pop es/ss/ds + * Normally not used in userspace, but would execute if used. + * Can cause GP or stack exception if tries to load wrong segment descriptor. + * We hesitate to run them under single step since kernel's handling + * of userspace single-stepping (TF flag) is fragile. + * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e) + * on the same grounds that they are never used. + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. 
Doesn't SEGV unless arguments are bad */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -94,27 +121,61 @@ static volatile u32 good_insns_32[256 / 32] = { #define good_insns_32 NULL #endif -/* Good-instruction tables for 64-bit apps */ +/* Good-instruction tables for 64-bit apps. + * + * Genuinely invalid opcodes: + * 06,07 - formerly push/pop es + * 0e - formerly push cs + * 16,17 - formerly push/pop ss + * 1e,1f - formerly push/pop ds + * 27,2f,37,3f - formerly daa/das/aaa/aas + * 60,61 - formerly pusha/popa + * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported) + * 82 - formerly redundant encoding of Group1 + * 9a - formerly call seg:ofs + * ce - formerly into + * d4,d5 - formerly aam/aad + * d6 - formerly undocumented salc + * ea - formerly jmp seg:ofs + * + * Opcodes we'll probably never support: + * 6c-6f - ins,outs. SEGVs if used in userspace + * e4-e7 - in,out imm. SEGVs if used in userspace + * ec-ef - in,out acc. SEGVs if used in userspace + * cc - int3. SIGTRAP if used in userspace + * f1 - int1. SIGTRAP if used in userspace + * f4 - hlt. SEGVs if used in userspace + * fa - cli. SEGVs if used in userspace + * fb - sti. SEGVs if used in userspace + * + * Opcodes which need some work to be supported: + * cd - int N. + * Used by userspace for "int 80" syscall entry. (Other "int N" + * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3). + * Not supported since kernel's handling of userspace single-stepping + * (TF flag) is fragile. + * cf - iret. Normally not used in userspace. 
Doesn't SEGV unless arguments are bad + */ #if defined(CONFIG_X86_64) static volatile u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -122,49 +183,55 @@ static volatile u32 good_insns_64[256 / 32] = { #define good_insns_64 NULL #endif -/* Using this for both 64-bit and 32-bit apps */ +/* Using this for both 64-bit and 32-bit apps. + * Opcodes we don't support: + * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns + * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group. + * Also encodes tons of other system insns if mod=11. + * Some are in fact non-system: xend, xtest, rdtscp, maybe more + * 0f 05 - syscall + * 0f 06 - clts (CPL0 insn) + * 0f 07 - sysret + * 0f 08 - invd (CPL0 insn) + * 0f 09 - wbinvd (CPL0 insn) + * 0f 0b - ud2 + * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?) + * 0f 34 - sysenter + * 0f 35 - sysexit + * 0f 37 - getsec + * 0f 78 - vmread (Intel VMX. CPL0 insn) + * 0f 79 - vmwrite (Intel VMX. CPL0 insn) + * Note: with prefixes, these two opcodes are + * extrq/insertq/AVX512 convert vector ops. + * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt], + * {rd,wr}{fs,gs}base,{s,l,m}fence. + * Why? They are all user-executable. 
+ */ static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ - W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ - W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ - W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ - W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */ /* ---------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; #undef W /* - * opcodes we'll probably never support: - * - * 6c-6d, e4-e5, ec-ed - in - * 6e-6f, e6-e7, ee-ef - out - * cc, cd - int3, int - * cf - iret - * d6 - illegal instruction - * f1 - int1/icebp - * f4 - hlt - * fa, fb - cli, sti - * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 - * - * invalid opcodes in 64-bit mode: - * - * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 - * 63 - we support this opcode in x86_64 but not in i386. 
- * * opcodes we may need to refine support for: * * 0f - 2-byte instructions: For many of these instructions, the validity diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 0406819..37d8fa4 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial); #undef memset #undef memmove +extern void *__memset(void *, int, __kernel_size_t); +extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *__memmove(void *, const void *, __kernel_size_t); extern void *memset(void *, int, __kernel_size_t); extern void *memcpy(void *, const void *, __kernel_size_t); -extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *memmove(void *, const void *, __kernel_size_t); + +EXPORT_SYMBOL(__memset); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(__memmove); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); #ifndef CONFIG_DEBUG_VIRTUAL diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 0de1fae..34f66e5 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -12,6 +12,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/sigframe.h> +#include <asm/tlbflush.h> #include <asm/xcr.h> /* @@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { - set_in_cr4(X86_CR4_OSXSAVE); + cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 7dc7ba5..413a7bf 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -39,6 +39,7 @@ config KVM select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU ---help--- diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de12c1d..e0b794a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -86,6 +86,7 @@ #define DstAcc (OpAcc << DstShift) #define DstDI (OpDI << DstShift) #define DstMem64 (OpMem64 << DstShift) +#define DstMem16 (OpMem16 << DstShift) #define DstImmUByte (OpImmUByte << DstShift) #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) @@ -124,6 +125,7 @@ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Escape (5<<15) /* Escape to coprocessor instruction */ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ +#define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. 
*/ #define ModRM (1<<19) @@ -165,10 +167,10 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ -#define NoBigReal ((u64)1 << 50) /* No big real mode */ #define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ +#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -213,6 +215,7 @@ struct opcode { const struct gprefix *gprefix; const struct escape *esc; const struct instr_dual *idual; + const struct mode_dual *mdual; void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); @@ -240,6 +243,11 @@ struct instr_dual { struct opcode mod3; }; +struct mode_dual { + struct opcode mode32; + struct opcode mode64; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -262,6 +270,13 @@ struct instr_dual { #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a #define EFLG_RESERVED_ONE_MASK 2 +enum x86_transfer_type { + X86_TRANSFER_NONE, + X86_TRANSFER_CALL_JMP, + X86_TRANSFER_RET, + X86_TRANSFER_TASK_SWITCH, +}; + static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (!(ctxt->regs_valid & (1 << nr))) { @@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } if (addr.ea > lim) goto bad; - *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea); - if (size > *max_size) - goto bad; + if (lim == 0xffffffff) + *max_size = ~0u; + else { + *max_size = (u64)lim + 1 - addr.ea; + if (size > *max_size) + goto bad; + } la &= (u32)-1; break; } @@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, const struct desc_struct *cs_desc) { enum x86emul_mode mode = ctxt->mode; + int rc; #ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) { - u64 efer = 0; + if (ctxt->mode >= X86EMUL_MODE_PROT16) { + if (cs_desc->l) { + u64 efer = 0; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + mode = X86EMUL_MODE_PROT64; + } else + mode = X86EMUL_MODE_PROT32; /* temporary value */ } #endif if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) mode = cs_desc->d ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - return assign_eip(ctxt, dst, mode); + rc = assign_eip(ctxt, dst, mode); + if (rc == X86EMUL_CONTINUE) + ctxt->mode = mode; + return rc; } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstcw %0": "+m"(fcw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fcw; return X86EMUL_CONTINUE; @@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) asm volatile("fnstsw %0": "+m"(fsw)); ctxt->ops->put_fpu(ctxt); - /* force 2 byte destination */ - ctxt->dst.bytes = 2; ctxt->dst.val = fsw; return X86EMUL_CONTINUE; @@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, else { modrm_ea += reg_read(ctxt, base_reg); adjust_modrm_seg(ctxt, base_reg); + /* Increment ESP on POP [ESP] */ + if ((ctxt->d & IncSP) && + base_reg == VCPU_REGS_RSP) + modrm_ea += ctxt->op_bytes; } if (index_reg != 4) modrm_ea += reg_read(ctxt, index_reg) << scale; @@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, ops->get_gdt(ctxt, dt); } -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc, - ulong *desc_addr_p) +static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt, + u16 selector, ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - *desc_addr_p = addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + addr = dt.address + index * 8; + +#ifdef CONFIG_X86_64 + if (addr >> 32 != 0) { + u64 efer = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (!(efer & EFER_LMA)) + addr &= (u32)-1; + } +#endif + + *desc_addr_p = addr; + return X86EMUL_CONTINUE; +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) +{ + int rc; + + rc = get_descriptor_ptr(ctxt, selector, desc_addr_p); + if (rc != X86EMUL_CONTINUE) + return rc; + + return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc), &ctxt->exception); } @@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc) { - struct desc_ptr dt; - u16 index = selector >> 3; + int rc; ulong addr; - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); + rc = get_descriptor_ptr(ctxt, selector, &addr); + if (rc != X86EMUL_CONTINUE) + return rc; - addr = dt.address + index * 8; return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, - bool in_task_switch, + enum x86_transfer_type transfer, struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; @@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; err_code = selector & 0xfffc; - err_vec = in_task_switch ? 
TS_VECTOR : GP_VECTOR; + err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR : + GP_VECTOR; /* can't load system descriptor into segment selector */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) + if (seg <= VCPU_SREG_GS && !seg_desc.s) { + if (transfer == X86_TRANSFER_CALL_JMP) + return X86EMUL_UNHANDLEABLE; goto exception; + } if (!seg_desc.p) { err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; @@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (seg_desc.s) { /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (!(seg_desc.type & 1)) { + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, selector, + &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } } else if (ctxt->mode == X86EMUL_MODE_PROT64) { ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, sizeof(base3), &ctxt->exception); @@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); + return __load_segment_descriptor(ctxt, selector, seg, cpl, + X86_TRANSFER_NONE, NULL); } static void write_register_operand(struct operand *op) @@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) unsigned long selector; int rc; - rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); + rc = emulate_pop(ctxt, &selector, 2); if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->modrm_reg == VCPU_SREG_SS) ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + if (ctxt->op_bytes > 2) + rsp_increment(ctxt, ctxt->op_bytes - 2); rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; @@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + ctxt->ops->set_nmi_mask(ctxt, false); return rc; } @@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false, + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, + X86_TRANSFER_RET, &new_desc); if (rc != X86EMUL_CONTINUE) return rc; @@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ + /* Success: write back to memory; no update of EAX */ + ctxt->src.type = OP_NONE; ctxt->dst.val = ctxt->src.orig_val; } else { /* Failure: write the value we saw to EAX. 
*/ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.type = OP_REG; + ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; + /* Create write-cycle to dest by writing the same value */ ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; @@ -2556,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2694,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, - cpl, true, NULL); + cpl, X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, - true, NULL); + X86_TRANSFER_TASK_SWITCH, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2739,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; save_state_to_tss32(ctxt, &tss_seg); @@ -2748,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; if (old_tss_sel != 0xffff) { @@ -2765,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, sizeof tss_seg.prev_task_link, &ctxt->exception); if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ return ret; } @@ -2999,15 +3058,16 @@ static int 
em_call_far(struct x86_emulate_ctxt *ctxt) struct desc_struct old_desc, new_desc; const struct x86_emulate_ops *ops = ctxt->ops; int cpl = ctxt->ops->cpl(ctxt); + enum x86emul_mode prev_mode = ctxt->mode; old_eip = ctxt->_eip; ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, - &new_desc); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, + X86_TRANSFER_CALL_JMP, &new_desc); if (rc != X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + return rc; rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); if (rc != X86EMUL_CONTINUE) @@ -3022,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) rc = em_push(ctxt); /* If we failed, we tainted the memory, but the very least we should restore cs */ - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + pr_warn_once("faulting far call emulation tainted memory\n"); goto fail; + } return rc; fail: ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + ctxt->mode = prev_mode; return rc; } @@ -3477,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movsxd(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = (s32) ctxt->src.val; + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3676,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) } +#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) } #define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } @@ -3738,7 +3808,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -3854,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = { }; static const struct escape escape_d9 = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstcw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3896,7 +3966,7 @@ static const struct escape escape_db = { { } }; static const struct escape escape_dd = { { - N, N, N, N, N, N, N, I(DstMem, em_fnstsw), + N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw), }, { /* 0xC0 - 0xC7 */ N, N, N, N, N, N, N, N, @@ -3920,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = { I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N }; +static const struct mode_dual mode_dual_63 = { + N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd) +}; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ F6ALU(Lock, em_add), @@ -3954,7 +4028,7 @@ static const struct opcode opcode_table[256] = { /* 0x60 - 0x67 */ I(ImplicitOps | Stack | No64, em_pusha), I(ImplicitOps | Stack | No64, em_popa), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, MD(ModRM, &mode_dual_63), N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), @@ -4010,8 +4084,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | 
Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), - I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | SrcImmU16, em_ret_far_imm), + I(ImplicitOps, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ @@ -4108,7 +4182,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), @@ -4174,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I #undef GP #undef EXT +#undef MD +#undef ID #undef D2bv #undef D2bvIP @@ -4563,6 +4639,12 @@ done_prefixes: else opcode = opcode.u.idual->mod012; break; + case ModeDual: + if (ctxt->mode == X86EMUL_MODE_PROT64) + opcode = opcode.u.mdual->mode64; + else + opcode = opcode.u.mdual->mode32; + break; default: return EMULATION_FAILED; } @@ -4860,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* optimisation - avoid slow emulated read if Mov */ rc = segmented_read(ctxt, ctxt->dst.addr.mem, &ctxt->dst.val, ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) + if (rc != X86EMUL_CONTINUE) { + if (!(ctxt->d & NoWrite) && + rc == X86EMUL_PROPAGATE_FAULT && + ctxt->exception.vector == PF_VECTOR) + ctxt->exception.error_code |= PFERR_WRITE_MASK; goto done; + } } ctxt->dst.orig_val = ctxt->dst.val; @@ -4899,11 +4986,6 @@ special_insn: goto threebyte_insn; switch (ctxt->b) { - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - ctxt->dst.val = (s32) ctxt->src.val; - break; case 0x70 ... 
0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) rc = jmp_rel(ctxt, ctxt->src.val); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3c91955..c2e36d9 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 17b73ee..7dbced3 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) gfn += page_size >> PAGE_SHIFT; - + cond_resched(); } return 0; @@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm, kvm_unpin_pages(kvm, pfn, unmap_pages); gfn += unmap_pages; + + cond_resched(); } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d52dcf0..e55b5fc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -33,6 +33,7 @@ #include <asm/page.h> #include <asm/current.h> #include <asm/apicdef.h> +#include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include "kvm_cache_regs.h" @@ -327,17 +328,24 @@ static u8 count_vectors(void *bitmap) return count; } -void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +void __kvm_apic_update_irr(u32 *pir, void *regs) { u32 i, pir_val; - struct kvm_lapic *apic = vcpu->arch.apic; for (i = 0; i <= 7; i++) { pir_val = xchg(&pir[i], 0); if (pir_val) - *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val; + *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; } } +EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); + +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + __kvm_apic_update_irr(pir, apic->regs); +} EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline void apic_set_irr(int vec, struct kvm_lapic *apic) @@ -405,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); else { ++apic->isr_count; @@ -453,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) + if (unlikely(kvm_x86_ops->hwapic_isr_update)) kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); else { @@ -580,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) apic_update_ppr(apic); } -static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest) { return dest == (apic_x2apic_mode(apic) ? 
X2APIC_BROADCAST : APIC_BROADCAST); } -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) +static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest) { return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest); } -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) +static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { - int result = 0; u32 logical_id; if (kvm_apic_broadcast(apic, mda)) - return 1; + return true; - if (apic_x2apic_mode(apic)) { - logical_id = kvm_apic_get_reg(apic, APIC_LDR); - return logical_id & mda; - } + logical_id = kvm_apic_get_reg(apic, APIC_LDR); - logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR)); + if (apic_x2apic_mode(apic)) + return ((logical_id >> 16) == (mda >> 16)) + && (logical_id & mda & 0xffff) != 0; + + logical_id = GET_APIC_LOGICAL_ID(logical_id); switch (kvm_apic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: - if (logical_id & mda) - result = 1; - break; + return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: - if (((logical_id >> 4) == (mda >> 0x4)) - && (logical_id & mda & 0xf)) - result = 1; - break; + return ((logical_id >> 4) == (mda >> 4)) + && (logical_id & mda & 0xf) != 0; default: apic_debug("Bad DFR vcpu %d: %08x\n", apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR)); - break; + return false; } - - return result; } -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, +bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { - int result = 0; struct kvm_lapic *target = vcpu->arch.apic; apic_debug("target %p, source %p, dest 0x%x, " @@ -638,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: - if (dest_mode == 0) - /* Physical mode. */ - result = kvm_apic_match_physical_addr(target, dest); + if (dest_mode == APIC_DEST_PHYSICAL) + return kvm_apic_match_physical_addr(target, dest); else - /* Logical mode. */ - result = kvm_apic_match_logical_addr(target, dest); - break; + return kvm_apic_match_logical_addr(target, dest); case APIC_DEST_SELF: - result = (target == source); - break; + return target == source; case APIC_DEST_ALLINC: - result = 1; - break; + return true; case APIC_DEST_ALLBUT: - result = (target != source); - break; + return target != source; default: apic_debug("kvm: apic: Bad dest shorthand value %x\n", short_hand); - break; + return false; } - - return result; } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, @@ -693,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, ret = true; - if (irq->dest_mode == 0) { /* physical mode */ + if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) goto out; @@ -1076,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; wait_queue_head_t *q = &vcpu->wq; + struct kvm_timer *ktimer = &apic->lapic_timer; - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. 
- */ if (atomic_read(&apic->lapic_timer.pending)) return; atomic_inc(&apic->lapic_timer.pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_set_pending_timer(vcpu); if (waitqueue_active(q)) wake_up_interruptible(q); + + if (apic_lvtt_tscdeadline(apic)) + ktimer->expired_tscdeadline = ktimer->tscdeadline; +} + +/* + * On APICv, this test will cause a busy wait + * during a higher-priority task. + */ + +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT); + + if (kvm_apic_hw_enabled(apic)) { + int vec = reg & APIC_VECTOR_MASK; + void *bitmap = apic->regs + APIC_ISR; + + if (kvm_x86_ops->deliver_posted_interrupt) + bitmap = apic->regs + APIC_IRR; + + if (apic_test_vector(vec, bitmap)) + return true; + } + return false; +} + +void wait_lapic_expire(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 guest_tsc, tsc_deadline; + + if (!kvm_vcpu_has_lapic(vcpu)) + return; + + if (apic->lapic_timer.expired_tscdeadline == 0) + return; + + if (!lapic_timer_int_injected(vcpu)) + return; + + tsc_deadline = apic->lapic_timer.expired_tscdeadline; + apic->lapic_timer.expired_tscdeadline = 0; + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); + + /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ + if (guest_tsc < tsc_deadline) + __delay(tsc_deadline - guest_tsc); } static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; + atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { @@ -1140,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic) /* lapic timer in tsc deadline mode */ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; + ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; @@ -1154,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic) if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); @@ -1745,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, if (kvm_x86_ops->hwapic_irr_update) kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); + if (unlikely(kvm_x86_ops->hwapic_isr_update)) + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_rtc_eoi_tracking_restore_one(vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index c674fce..0bc6c65 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -14,6 +14,7 @@ struct kvm_timer { u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; + u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ }; @@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); +void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); -int 
kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); +void wait_lapic_expire(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f83fc6c..cee7592 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -63,30 +63,16 @@ enum { #undef MMU_DEBUG #ifdef MMU_DEBUG +static bool dbg = 0; +module_param(dbg, bool, 0644); #define pgprintk(x...) do { if (dbg) printk(x); } while (0) #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - +#define MMU_WARN_ON(x) WARN_ON(x) #else - #define pgprintk(x...) do { } while (0) #define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } +#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 @@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) return (old_spte & bit_mask) && !(new_spte & bit_mask); } +static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) != (new_spte & bit_mask); +} + /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (!shadow_accessed_mask) return ret; + /* + * Flush TLB when accessed/dirty bits are changed in the page tables, + * to guarantee consistency between TLB and page tables. 
+ */ + if (spte_is_bit_changed(old_spte, new_spte, + shadow_accessed_mask | shadow_dirty_mask)) + ret = true; + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) @@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, return flush; } +static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + + spte &= ~shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_clear_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + +static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +{ + u64 spte = *sptep; + + rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + + spte |= shadow_dirty_mask; + + return mmu_spte_update(sptep, spte); +} + +static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + bool flush = false; + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + flush |= spte_set_dirty(kvm, sptep); + sptep = rmap_get_next(&iter); + } + + return flush; +} + /** * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages * @kvm: kvm instance @@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, * Used when we do not need to care about huge page mappings: e.g. during dirty * logging we do not have any such mappings. */ -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, +static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { @@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } +/** + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * @kvm: kvm instance + * @slot: slot to clear D-bit + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should clear D-bit + * + * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. + */ +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + unsigned long *rmapp; + + while (mask) { + rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PT_PAGE_TABLE_LEVEL, slot); + __rmap_clear_dirty(kvm, rmapp); + + /* clear the first set bit */ + mask &= mask - 1; + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); + +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected + * PT level pages. + * + * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to + * enable dirty logging for them. + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. 
+ */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + if (kvm_x86_ops->enable_log_dirty_pt_masked) + kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, + mask); + else + kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); +} + static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; @@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { - ASSERT(is_empty_shadow_page(sp->spt)); + MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); @@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (pte_access & ACC_WRITE_MASK) + if (pte_access & ACC_WRITE_MASK) { mark_page_dirty(vcpu->kvm, gfn); + spte |= shadow_dirty_mask; + } set_pte: if (mmu_spte_update(sptep, spte)) @@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, */ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + /* + * Theoretically we could also set dirty bit (and flush TLB) here in + * order to eliminate unnecessary PML logging. See comments in + * set_spte. But fast_page_fault is very unlikely to happen with PML + * enabled, so we do not do this. This might result in the same GPA + * to be logged in PML buffer again when the write really happens, and + * eventually to be called by mark_page_dirty twice. But it's also no + * harm. This also avoids the TLB flush needed after setting dirty bit + * so non-PML cases won't be impacted. + * + * Compare with set_spte where instead shadow_dirty_mask is set. + */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) mark_page_dirty(vcpu->kvm, gfn); @@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), @@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); make_mmu_pages_available(vcpu); @@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; - ASSERT(!VALID_PAGE(root)); + MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { @@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); gfn = gva >> PAGE_SHIFT; @@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, gpa, error_code, true); @@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, 
update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); - ASSERT(is_pae(vcpu)); + MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.walk_mmu; + struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; context->page_fault = tdp_page_fault; @@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_last_pte_bitmap(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else paging32_init_context(vcpu, context); - vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp + context->base_role.nxe = is_nx(vcpu); + context->base_role.cr4_pae = !!is_pae(vcpu); + context->base_role.cr0_wp = is_write_protection(vcpu); + context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + struct kvm_mmu *context = &vcpu->arch.mmu; + + MMU_WARN_ON(VALID_PAGE(context->root_hpa)); context->shadow_root_level = kvm_x86_ops->get_tdp_level(); @@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + struct kvm_mmu *context = &vcpu->arch.mmu; + + kvm_init_shadow_mmu(vcpu); + context->set_cr3 = kvm_x86_ops->set_cr3; + context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; + context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) @@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static void init_kvm_mmu(struct kvm_vcpu *vcpu) { if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); + init_kvm_nested_mmu(vcpu); else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); + init_kvm_tdp_mmu(vcpu); else - return init_kvm_softmmu(vcpu); + init_kvm_softmmu(vcpu); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); init_kvm_mmu(vcpu); } @@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - ASSERT(vcpu); - /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 
* Therefore we need to allocate shadow page tables in the first @@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; @@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) void kvm_mmu_setup(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - struct kvm_memory_slot *memslot; gfn_t last_gfn; int i; + bool flush = false; - memslot = id_to_memslot(kvm->memslots, slot); last_gfn = memslot->base_gfn + memslot->npages - 1; spin_lock(&kvm->mmu_lock); @@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) - __rmap_write_protect(kvm, rmapp, false); + flush |= __rmap_write_protect(kvm, rmapp, + false); if (need_resched() || spin_needbreak(&kvm->mmu_lock)) cond_resched_lock(&kvm->mmu_lock); @@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. */ - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + unsigned long *rmapp; + unsigned long last_index, index; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_clear_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* + * It's also safe to flush TLBs out of mmu lock here as currently this + * function is only used for dirty logging, in which case flushing TLB + * out of mmu lock also guarantees no dirty pages will be lost in + * dirty_bitmap. 
+ */ + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); + +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_write_protect(kvm, rmapp, + false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); + + /* see kvm_mmu_slot_remove_write_access */ + lockdep_assert_held(&kvm->slots_lock); + + if (flush) + kvm_flush_remote_tlbs(kvm); +} +EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); + +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + gfn_t last_gfn; + int i; + bool flush = false; + + last_gfn = memslot->base_gfn + memslot->npages - 1; + + spin_lock(&kvm->mmu_lock); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= __rmap_set_dirty(kvm, rmapp); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + + lockdep_assert_held(&kvm->slots_lock); + + /* see kvm_mmu_slot_leaf_clear_dirty */ + if (flush) + kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) @@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - ASSERT(vcpu); - kvm_mmu_unload(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bde8ee7..c7d6563 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,18 +44,6 @@ #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 - -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) - static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -81,9 +69,8 @@ enum { }; int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - bool execonly); +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41dd038..d319e0c 
100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1583,7 +1583,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (cr4 & X86_CR4_VMXE) @@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_mmu(vcpu); vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c2a34bb..7c7bc8b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc, #endif /* CONFIG_X86_64 */ +/* + * Tracepoint for PML full VMEXIT. + */ +TRACE_EVENT(kvm_pml_full, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %d: PML full", __entry->vcpu_id) +); + TRACE_EVENT(kvm_ple_window, TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), TP_ARGS(grow, vcpu_id, new, old), @@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update, __entry->flags) ); +TRACE_EVENT(kvm_wait_lapic_expire, + TP_PROTO(unsigned int vcpu_id, s64 delta), + TP_ARGS(vcpu_id, delta), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( s64, delta ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->delta = delta; + ), + + TP_printk("vcpu %u: delta %lld (%s)", + __entry->vcpu_id, + __entry->delta, + __entry->delta < 0 ? 
"early" : "late") +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4c58d8..14c1a18 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> +#include <asm/apic.h> #include "trace.h" @@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO); static u64 __read_mostly host_xss; +static bool __read_mostly enable_pml = 1; +module_param_named(pml, enable_pml, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -215,7 +219,12 @@ struct __packed vmcs12 { u64 tsc_offset; u64 virtual_apic_page_addr; u64 apic_access_addr; + u64 posted_intr_desc_addr; u64 ept_pointer; + u64 eoi_exit_bitmap0; + u64 eoi_exit_bitmap1; + u64 eoi_exit_bitmap2; + u64 eoi_exit_bitmap3; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; @@ -330,6 +339,7 @@ struct __packed vmcs12 { u32 vmx_preemption_timer_value; u32 padding32[7]; /* room for future expansion */ u16 virtual_processor_id; + u16 posted_intr_nv; u16 guest_es_selector; u16 guest_cs_selector; u16 guest_ss_selector; @@ -338,6 +348,7 @@ struct __packed vmcs12 { u16 guest_gs_selector; u16 guest_ldtr_selector; u16 guest_tr_selector; + u16 guest_intr_status; u16 host_es_selector; u16 host_cs_selector; u16 host_ss_selector; @@ -401,6 +412,10 @@ struct nested_vmx { */ struct page *apic_access_page; struct page *virtual_apic_page; + struct page *pi_desc_page; + struct pi_desc *pi_desc; + bool pi_pending; + u16 posted_intr_nv; u64 msr_ia32_feature_control; struct hrtimer preemption_timer; @@ -408,6 +423,23 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + + u32 nested_vmx_procbased_ctls_low; + u32 nested_vmx_procbased_ctls_high; + u32 nested_vmx_true_procbased_ctls_low; + u32 nested_vmx_secondary_ctls_low; + u32 nested_vmx_secondary_ctls_high; + u32 nested_vmx_pinbased_ctls_low; + u32 nested_vmx_pinbased_ctls_high; + u32 nested_vmx_exit_ctls_low; + u32 nested_vmx_exit_ctls_high; + u32 nested_vmx_true_exit_ctls_low; + u32 nested_vmx_entry_ctls_low; + u32 nested_vmx_entry_ctls_high; + u32 nested_vmx_true_entry_ctls_low; + u32 nested_vmx_misc_low; + u32 nested_vmx_misc_high; + u32 nested_vmx_ept_caps; }; #define POSTED_INTR_ON 0 @@ -511,6 +543,10 @@ struct vcpu_vmx { /* Dynamic PLE window. 
*/ int ple_window; bool ple_window_dirty; + + /* Support for PML */ +#define PML_ENTITY_NUM 512 + struct page *pml_pg; }; enum segment_cache_field { @@ -594,6 +630,7 @@ static int max_shadow_read_write_fields = static const unsigned short vmcs_field_to_offset_table[] = { FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), + FIELD(POSTED_INTR_NV, posted_intr_nv), FIELD(GUEST_ES_SELECTOR, guest_es_selector), FIELD(GUEST_CS_SELECTOR, guest_cs_selector), FIELD(GUEST_SS_SELECTOR, guest_ss_selector), @@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(GUEST_GS_SELECTOR, guest_gs_selector), FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), FIELD(GUEST_TR_SELECTOR, guest_tr_selector), + FIELD(GUEST_INTR_STATUS, guest_intr_status), FIELD(HOST_ES_SELECTOR, host_es_selector), FIELD(HOST_CS_SELECTOR, host_cs_selector), FIELD(HOST_SS_SELECTOR, host_ss_selector), @@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(TSC_OFFSET, tsc_offset), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), + FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), + FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), + FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), + FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), @@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); +static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void) return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - static inline bool cpu_has_vmx_ept_2m_page(void) { return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; @@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void) SECONDARY_EXEC_SHADOW_VMCS; } +static inline bool cpu_has_vmx_pml(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) vmx_xsaves_supported(); } +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); +} + +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); +} + +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); +} + +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) +{ + return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). - * TODO: allow these variables to be modified (downgraded) by module options - * or other means. */ -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; -static u32 nested_vmx_true_procbased_ctls_low; -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; -static u32 nested_vmx_true_exit_ctls_low; -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; -static u32 nested_vmx_true_entry_ctls_low; -static u32 nested_vmx_misc_low, nested_vmx_misc_high; -static u32 nested_vmx_ept_caps; -static __init void nested_vmx_setup_ctls_msrs(void) +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) { /* * Note that as a general rule, the high half of the MSRs (bits in @@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; - nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); + vmx->nested.nested_vmx_pinbased_ctls_low |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_pinbased_ctls_high &= + PIN_BASED_EXT_INTR_MASK | + PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; + if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + vmx->nested.nested_vmx_pinbased_ctls_high |= + PIN_BASED_POSTED_INTR; /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); - nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); + vmx->nested.nested_vmx_exit_ctls_low = + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_exit_ctls_high &= + vmx->nested.nested_vmx_exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_exit_ctls_high |= + VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; if (vmx_mpx_supported()) - nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. 
*/ - nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + vmx->nested.nested_vmx_true_exit_ctls_low = + vmx->nested.nested_vmx_exit_ctls_low & ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_entry_ctls_high &= + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); + vmx->nested.nested_vmx_entry_ctls_low = + VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif VM_ENTRY_LOAD_IA32_PAT; - nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | - VM_ENTRY_LOAD_IA32_EFER); + vmx->nested.nested_vmx_entry_ctls_high |= + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); if (vmx_mpx_supported()) - nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + vmx->nested.nested_vmx_true_entry_ctls_low = + vmx->nested.nested_vmx_entry_ctls_low & ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - nested_vmx_procbased_ctls_high &= + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); + vmx->nested.nested_vmx_procbased_ctls_low = + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + vmx->nested.nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | @@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + vmx->nested.nested_vmx_procbased_ctls_high |= + CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. 
*/ - nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_true_procbased_ctls_low = + vmx->nested.nested_vmx_procbased_ctls_low & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); - nested_vmx_secondary_ctls_low = 0; - nested_vmx_secondary_ctls_high &= + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); + vmx->nested.nested_vmx_secondary_ctls_low = 0; + vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST; - nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | + vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; - nested_vmx_ept_caps &= vmx_capability.ept; + vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific * for single context invalidation. Hence, only advertise * support for global context invalidation. */ - nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; } else - nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_ept_caps = 0; /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high); - nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; - nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | + rdmsr(MSR_IA32_VMX_MISC, + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; + vmx->nested.nested_vmx_misc_low |= + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; - nested_vmx_misc_high = 0; + vmx->nested.nested_vmx_misc_high = 0; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high) /* Returns 0 on success, non-0 otherwise. 
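As the earlier comment above nested_vmx_setup_ctls_msrs() explains, each nested VMX capability MSR packs two 32-bit halves: the low half holds the bits L1 must keep set, the high half the bits L1 may set. Reading back such an MSR and validating an L1-chosen control word then reduces to a few bit operations; a self-contained sketch of the rule vmx_control_verify() enforces, with the check written out explicitly (illustrative names, not the kernel helpers themselves):

#include <stdbool.h>
#include <stdint.h>

/* Pack the two halves the way the capability MSR reports them:
 * bits 31:0 = must-be-one, bits 63:32 = may-be-one. */
static uint64_t control_msr(uint32_t low, uint32_t high)
{
	return ((uint64_t)high << 32) | low;
}

/* An L1-supplied control word is acceptable iff every required bit is
 * set and nothing outside the allowed set is set. */
static bool control_verify(uint32_t control, uint32_t low, uint32_t high)
{
	return (control & low) == low &&
	       (control & ~(high | low)) == 0;
}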
*/ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + switch (msr_index) { case MSR_IA32_VMX_BASIC: /* @@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, - nested_vmx_pinbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, - nested_vmx_procbased_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, - nested_vmx_exit_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, - nested_vmx_entry_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high); break; case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr(nested_vmx_misc_low, - nested_vmx_misc_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); break; /* * These MSRs specify bits which the guest must keep fixed (on or off) @@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, - nested_vmx_secondary_ctls_high); + *pdata = vmx_control_msr( + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps; break; default: return 1; @@ -2785,7 +2871,7 @@ static int hardware_enable(void) u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old, test_bits; - if (read_cr4() & X86_CR4_VMXE) + if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); @@ -2812,7 +2898,7 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + cr4_set_bits(X86_CR4_VMXE); if (vmm_exclusive) { kvm_cpu_vmxon(phys_addr); @@ -2849,7 +2935,7 @@ static void hardware_disable(void) vmclear_local_loaded_vmcss(); 
kvm_cpu_vmxoff(); } - write_cr4(read_cr4() & ~X86_CR4_VMXE); + cr4_clear_bits(X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_PML; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, } } +/* + * If a msr is allowed by L0, we should check whether it is allowed by L1. + * The corresponding bit will be cleared unless both of L0 and L1 allow it. + */ +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_nested, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) { + WARN_ON(1); + return; + } + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) + /* read-low */ + __clear_bit(msr, msr_bitmap_nested + 0x000 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) + /* write-low */ + __clear_bit(msr, msr_bitmap_nested + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R && + !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) + /* read-high */ + __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + + if (type & MSR_TYPE_W && + !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) + /* write-high */ + __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); + + } +} + static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) @@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int max_irr; + void *vapic_page; + u16 status; + + if (vmx->nested.pi_desc && + vmx->nested.pi_pending) { + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return 0; + + max_irr = find_last_bit( + (unsigned long *)vmx->nested.pi_desc->pir, 256); + + if (max_irr == 256) + return 0; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + if (!vapic_page) { + WARN_ON(1); + return -ENOMEM; + } + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + kunmap(vmx->nested.virtual_apic_page); + + status = vmcs_read16(GUEST_INTR_STATUS); + if ((u8)max_irr > ((u8)status & 0xff)) { + status &= ~0xff; + status |= (u8)max_irr; + vmcs_write16(GUEST_INTR_STATUS, status); + } + } + return 0; +} + +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (is_guest_mode(vcpu) && + vector == vmx->nested.posted_intr_nv) { + /* the PIR and ON have been set by L1. */ + if (vcpu->mode == IN_GUEST_MODE) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), + POSTED_INTR_VECTOR); + /* + * If a posted intr is not recognized by hardware, + * we will accomplish it in the next vmentry. 
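The two nested posted-interrupt helpers above split the work between L1 and L0: the descriptor L1 registered carries a 256-bit posted-interrupt request bitmap (PIR) plus an outstanding-notification (ON) bit, L1 sets the vector bit and ON itself, L0 only forwards the physical notification vector when the target vCPU is currently in guest mode and in any case records pi_pending so the next entry finishes delivery in software, and completion scans PIR for the highest pending vector to fold into the virtual-APIC state. A compact sketch of the descriptor bookkeeping, using an illustrative layout (the real struct pi_desc is defined elsewhere and is accessed with atomic bitops):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative descriptor: 256 request bits plus an ON flag. */
struct pi_desc_sketch {
	uint64_t pir[4];	/* one bit per vector 0..255 */
	bool on;		/* outstanding notification */
};

/* What the poster (here, L1) does for a given vector. */
static void pir_post(struct pi_desc_sketch *pi, uint8_t vector)
{
	pi->pir[vector / 64] |= 1ull << (vector % 64);
	pi->on = true;
}

/* Highest pending vector, or -1 if PIR is empty; completion uses this
 * to raise the guest interrupt status (RVI) if it is above the current one. */
static int pir_highest(const struct pi_desc_sketch *pi)
{
	for (int word = 3; word >= 0; word--)
		for (int bit = 63; bit >= 0; bit--)
			if (pi->pir[word] & (1ull << bit))
				return word * 64 + bit;
	return -1;
}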
+ */ + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + return -1; +} /* * Send interrupt to vcpu via posted interrupt way. * 1. If target vcpu is running(non-root mode), send posted interrupt @@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) struct vcpu_vmx *vmx = to_vmx(vcpu); int r; + r = vmx_deliver_nested_posted_interrupt(vcpu, vector); + if (!r) + return; + if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return; @@ -4255,7 +4450,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. */ - cr4 = read_cr4(); + cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->host_state.vmcs_host_cr4 = cr4; @@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + /* PML is enabled/disabled in creating/destorying vcpu */ + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + return exec_control; } @@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long always_on = VMXON_CR0_ALWAYSON; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_secondary_ctls_high & + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) always_on &= ~(X86_CR0_PE | X86_CR0_PG); @@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vmcs12, val)) + if (!nested_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5817,13 +6016,21 @@ static __init int hardware_setup(void) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) goto out4; + + if (nested) { + vmx_msr_bitmap_nested = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_nested) + goto out5; + } + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out5; + goto out6; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out6; + goto out7; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -5839,10 +6046,12 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + if (nested) + memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out7; + goto out8; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -5868,16 +6077,16 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; - if (!cpu_has_vmx_flexpriority()) { + if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. 
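The vmx_msr_bitmap_nested allocation added above slots into hardware_setup()'s existing last-in-first-out error unwinding, which is why the later labels get renumbered. The shape of that pattern, reduced to a standalone sketch with made-up resource names:

#include <stdlib.h>

static void *res_a, *res_b, *res_c;	/* kept for later use on success */

/* Acquire in order; on failure release what was acquired, in reverse. */
static int setup_sketch(void)
{
	res_a = malloc(64);
	if (!res_a)
		goto out;
	res_b = malloc(64);
	if (!res_b)
		goto free_a;
	res_c = malloc(64);	/* newly inserted step: later labels renumber */
	if (!res_c)
		goto free_b;
	return 0;

free_b:
	free(res_b);
free_a:
	free(res_a);
out:
	return -1;
}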
No need to do anything if not + * using the APIC_ACCESS_ADDR VMCS field. + */ + if (!flexpriority_enabled) kvm_x86_ops->set_apic_access_page_addr = NULL; - } if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; @@ -5895,13 +6104,11 @@ static __init int hardware_setup(void) kvm_x86_ops->update_cr8_intercept = NULL; else { kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->hwapic_isr_update = NULL; kvm_x86_ops->deliver_posted_interrupt = NULL; kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; } - if (nested) - nested_vmx_setup_ctls_msrs(); - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -5945,12 +6152,29 @@ static __init int hardware_setup(void) update_ple_window_actual_max(); + /* + * Only enable PML when hardware supports PML feature, and both EPT + * and EPT A/D bit features are enabled -- PML depends on them to work. + */ + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) + enable_pml = 0; + + if (!enable_pml) { + kvm_x86_ops->slot_enable_log_dirty = NULL; + kvm_x86_ops->slot_disable_log_dirty = NULL; + kvm_x86_ops->flush_log_dirty = NULL; + kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + } + return alloc_kvm_area(); -out7: +out8: free_page((unsigned long)vmx_vmwrite_bitmap); -out6: +out7: free_page((unsigned long)vmx_vmread_bitmap); +out6: + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: @@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); + if (nested) + free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, */ } +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) +{ + /* TODO: not to reset guest simply here. 
*/ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); +} + static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) { struct vcpu_vmx *vmx = @@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); vmcs_write64(VMCS_LINK_POINTER, -1ull); } + vmx->nested.posted_intr_nv = -1; kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } nested_free_all_saved_vmcss(vmx); } @@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmx_instruction_info, types; unsigned long type; gva_t gva; @@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) u64 eptp, gpa; } operand; - if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || - !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_EPT) || + !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; + types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, @@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; } +static int handle_pml_full(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification; + + trace_kvm_pml_full(vcpu->vcpu_id); + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + /* + * PML buffer FULL happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + /* + * PML buffer already flushed at beginning of VMEXIT. Nothing to do + * here.., and there's no userspace involvement needed for PML. + */ + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_invvpid, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_PML_FULL] = handle_pml_full, }; static const int kvm_vmx_max_exit_handlers = @@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_APIC_ACCESS: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + case EXIT_REASON_APIC_WRITE: + case EXIT_REASON_EOI_INDUCED: + /* apic_write and eoi_induced should exit unconditionally. */ + return 1; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is @@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } +static int vmx_enable_pml(struct vcpu_vmx *vmx) +{ + struct page *pml_pg; + u32 exec_control; + + pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!pml_pg) + return -ENOMEM; + + vmx->pml_pg = pml_pg; + + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + + return 0; +} + +static void vmx_disable_pml(struct vcpu_vmx *vmx) +{ + u32 exec_control; + + ASSERT(vmx->pml_pg); + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); +} + +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +{ + struct kvm *kvm = vmx->vcpu.kvm; + u64 *pml_buf; + u16 pml_idx; + + pml_idx = vmcs_read16(GUEST_PML_INDEX); + + /* Do nothing if PML buffer is empty */ + if (pml_idx == (PML_ENTITY_NUM - 1)) + return; + + /* PML index always points to next available PML buffer entity */ + if (pml_idx >= PML_ENTITY_NUM) + pml_idx = 0; + else + pml_idx++; + + pml_buf = page_address(vmx->pml_pg); + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + u64 gpa; + + gpa = pml_buf[pml_idx]; + WARN_ON(gpa & (PAGE_SIZE - 1)); + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + + /* reset PML index */ + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); +} + +/* + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. + * Called before reporting dirty_bitmap to userspace. + */ +static void kvm_flush_pml_buffers(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + /* + * We only need to kick vcpu out of guest mode here, as PML buffer + * is flushed at beginning of all VMEXITs, and it's obvious that only + * vcpus running in guest are possible to have unflushed GPAs in PML + * buffer. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + /* + * Flush logged GPAs PML buffer, this will make dirty_bitmap more + * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before + * querying dirty_bitmap, we only need to kick all vcpus out of guest + * mode as if vcpus is in root mode, the PML buffer must has been + * flushed already. 
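vmx_flush_pml_buffer() above relies on the hardware convention visible in the code: GUEST_PML_INDEX is armed at 511 and counts down as the CPU logs guest-physical addresses, always naming the next free slot, so the valid entries occupy indices pml_idx+1 through 511, the buffer is empty when the index still reads 511, and an index that reads back as 512 or more means it underflowed and every entry is valid. A standalone sketch of that walk over a plain array, with an illustrative mark_dirty() consumer:

#include <stdint.h>

#define PML_ENTRIES 512

/* Illustrative consumer of one logged guest-physical address. */
static void mark_dirty(uint64_t gpa) { (void)gpa; }

static void flush_pml(const uint64_t *buf, uint16_t *pml_index)
{
	uint16_t idx = *pml_index;

	if (idx == PML_ENTRIES - 1)
		return;			/* nothing was logged */

	if (idx >= PML_ENTRIES)
		idx = 0;		/* index underflowed: buffer is full */
	else
		idx++;			/* first valid entry */

	for (; idx < PML_ENTRIES; idx++)
		mark_dirty(buf[idx] & ~0xfffull);	/* page-aligned GPA */

	*pml_index = PML_ENTRIES - 1;	/* rearm for the next run */
}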
+ */ + if (enable_pml) + vmx_flush_pml_buffer(vmx); + /* If guest state is invalid, start emulating */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); @@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) u16 status; u8 old; - if (!vmx_vm_has_apicv(kvm)) - return; - if (isr == -1) isr = 0; @@ -7784,7 +8146,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - cr4 = read_cr4(); + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); vmx->host_state.vmcs_host_cr4 = cr4; @@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_pml) + vmx_disable_pml(vmx); free_vpid(vmx); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + if (nested) + nested_vmx_setup_ctls_msrs(vmx); + + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + err = vmx_enable_pml(vmx); + if (err) + goto free_vmcs; + } + return &vmx->vcpu; free_vmcs: @@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, - nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); - + WARN_ON(mmu_is_nested(vcpu)); + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.nested_vmx_ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; } +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code) +{ + bool inequality, bit; + + bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; + inequality = + (error_code & vmcs12->page_fault_error_code_mask) != + vmcs12->page_fault_error_code_match; + return inequality ^ bit; +} + static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. 
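nested_vmx_is_page_fault_vmexit() above encodes the rule that PFEC_MASK/PFEC_MATCH invert their meaning depending on whether bit 14 (#PF) is set in the exception bitmap: with the bit set, L1 wants the exit only when the masked error code equals the match value; with the bit clear, only when it differs. The XOR of the two booleans covers both cases at once. A sketch with illustrative names, spelling out the truth table:

#include <stdbool.h>
#include <stdint.h>

#define PF_VECTOR 14

struct pf_policy {
	uint32_t exception_bitmap;
	uint32_t pfec_mask;
	uint32_t pfec_match;
};

static bool pf_causes_vmexit(const struct pf_policy *p, uint16_t error_code)
{
	bool wants_pf = p->exception_bitmap & (1u << PF_VECTOR);
	bool mismatch = (error_code & p->pfec_mask) != p->pfec_match;

	/*
	 * wants_pf  && !mismatch -> exit (masked bits equal the match value)
	 * !wants_pf &&  mismatch -> exit (masked bits differ from the match)
	 * otherwise              -> no exit
	 */
	return mismatch ^ wants_pf;
}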
*/ - if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, return false; } + if (nested_cpu_has_posted_intr(vmcs12)) { + if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64)) + return false; + + if (vmx->nested.pi_desc_page) { /* shouldn't happen */ + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + } + vmx->nested.pi_desc_page = + nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); + if (!vmx->nested.pi_desc_page) + return false; + + vmx->nested.pi_desc = + (struct pi_desc *)kmap(vmx->nested.pi_desc_page); + if (!vmx->nested.pi_desc) { + nested_release_page_clean(vmx->nested.pi_desc_page); + return false; + } + vmx->nested.pi_desc = + (struct pi_desc *)((void *)vmx->nested.pi_desc + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } + return true; } @@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + u64 addr; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return 0; + + if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { + WARN_ON(1); + return -EINVAL; + } + maxphyaddr = cpuid_maxphyaddr(vcpu); + + if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || + ((addr + PAGE_SIZE) >> maxphyaddr)) + return -EINVAL; + + return 0; +} + +/* + * Merge L0's and L1's MSR bitmap, return false to indicate that + * we do not use the hardware. + */ +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int msr; + struct page *page; + unsigned long *msr_bitmap; + + if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + return false; + + page = nested_get_page(vcpu, vmcs12->msr_bitmap); + if (!page) { + WARN_ON(1); + return false; + } + msr_bitmap = (unsigned long *)kmap(page); + if (!msr_bitmap) { + nested_release_page_clean(page); + WARN_ON(1); + return false; + } + + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { + if (nested_cpu_has_apic_reg_virt(vmcs12)) + for (msr = 0x800; msr <= 0x8ff; msr++) + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + msr, MSR_TYPE_R); + /* TPR is allowed */ + nested_vmx_disable_intercept_for_msr(msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { + /* EOI and self-IPI are allowed */ + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap, + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + } else { + /* + * Enable reading intercept of all the x2apic + * MSRs. We should not rely on vmcs12 to do any + * optimizations here, it may have been modified + * by L1. 
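nested_vmx_merge_msr_bitmap() above operates on x2APIC register accesses, where an APIC register at MMIO offset reg is reachable as MSR 0x800 + (reg >> 4), so TPR at offset 0x80 becomes MSR 0x808. A set bit in an MSR bitmap means "intercept", and the merge only clears a bit in the bitmap actually handed to hardware when L1's own bitmap would also let the access through. A small sketch of that mapping and merge rule (illustrative helpers, not the kernel's bit layout with its separate read/write halves):

#include <stdbool.h>
#include <stdint.h>

#define APIC_BASE_MSR	0x800
#define APIC_TASKPRI	0x80	/* TPR register offset */

/* x2APIC register offset -> MSR index used by RDMSR/WRMSR. */
static uint32_t x2apic_msr(uint32_t reg_offset)
{
	return APIC_BASE_MSR + (reg_offset >> 4);
}

/* In MSR bitmaps a set bit means "intercept".  The merged bitmap may only
 * pass an access through when both L0 and L1 pass it through. */
static bool merged_intercept(bool l0_intercepts, bool l1_intercepts)
{
	return l0_intercepts || l1_intercepts;
}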
+ */ + for (msr = 0x800; msr <= 0x8ff; msr++) + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + msr, + MSR_TYPE_R); + + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + __vmx_enable_intercept_for_msr( + vmx_msr_bitmap_nested, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); + } + kunmap(page); + nested_release_page_clean(page); + + return true; +} + +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !nested_cpu_has_apic_reg_virt(vmcs12) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has_posted_intr(vmcs12)) + return 0; + + /* + * If virtualize x2apic mode is enabled, + * virtualize apic access must be disabled. + */ + if (nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + return -EINVAL; + + /* + * If virtual interrupt delivery is enabled, + * we must exit on external interrupts. + */ + if (nested_cpu_has_vid(vmcs12) && + !nested_exit_on_intr(vcpu)) + return -EINVAL; + + /* + * bits 15:8 should be zero in posted_intr_nv, + * the descriptor address has been already checked + * in nested_get_vmcs12_pages. + */ + if (nested_cpu_has_posted_intr(vmcs12) && + (!nested_cpu_has_vid(vmcs12) || + !nested_exit_intr_ack_set(vcpu) || + vmcs12->posted_intr_nv & 0xff00)) + return -EINVAL; + + /* tpr shadow is needed by all apicv features. */ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return -EINVAL; + + return 0; +} + +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, + unsigned long count_field, + unsigned long addr_field, + int maxphyaddr) +{ + u64 count, addr; + + if (vmcs12_read_any(vcpu, count_field, &count) || + vmcs12_read_any(vcpu, addr_field, &addr)) { + WARN_ON(1); + return -EINVAL; + } + if (count == 0) + return 0; + if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || + (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { + pr_warn_ratelimited( + "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", + addr_field, maxphyaddr, count, addr); + return -EINVAL; + } + return 0; +} + +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int maxphyaddr; + + if (vmcs12->vm_exit_msr_load_count == 0 && + vmcs12->vm_exit_msr_store_count == 0 && + vmcs12->vm_entry_msr_load_count == 0) + return 0; /* Fast path */ + maxphyaddr = cpuid_maxphyaddr(vcpu); + if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, + VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, + VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, + VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + return -EINVAL; + return 0; +} + +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + /* x2APIC MSR accesses are not allowed */ + if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + return -EINVAL; + if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ + e->index == MSR_IA32_UCODE_REV) + return -EINVAL; + if (e->reserved != 0) + return -EINVAL; + return 0; +} + +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_FS_BASE || + e->index == MSR_GS_BASE || + e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not 
supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, + struct vmx_msr_entry *e) +{ + if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + nested_vmx_msr_check_common(vcpu, e)) + return -EINVAL; + return 0; +} + +/* + * Load guest's/host's msr at nested entry/exit. + * return 0 for success, entry index for failure. + */ +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + struct msr_data msr; + + msr.host_initiated = false; + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), + &e, sizeof(e))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + goto fail; + } + if (nested_vmx_load_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + goto fail; + } + msr.index = e.index; + msr.data = e.value; + if (kvm_set_msr(vcpu, &msr)) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + goto fail; + } + } + return 0; +fail: + return i + 1; +} + +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) +{ + u32 i; + struct vmx_msr_entry e; + + for (i = 0; i < count; i++) { + if (kvm_read_guest(vcpu->kvm, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { + pr_warn_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(e)); + return -EINVAL; + } + if (nested_vmx_store_msr_check(vcpu, &e)) { + pr_warn_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e.index, e.reserved); + return -EINVAL; + } + if (kvm_get_msr(vcpu, e.index, &e.value)) { + pr_warn_ratelimited( + "%s cannot read MSR (%u, 0x%x)\n", + __func__, i, e.index); + return -EINVAL; + } + if (kvm_write_guest(vcpu->kvm, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &e.value, sizeof(e.value))) { + pr_warn_ratelimited( + "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", + __func__, i, e.index, e.value); + return -EINVAL; + } + } + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | - PIN_BASED_POSTED_INTR); + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + if (nested_cpu_has_posted_intr(vmcs12)) { + /* + * Note that we use L0's vector here and in + * vmx_deliver_nested_posted_interrupt. 
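The VM-entry/VM-exit MSR areas handled by nested_vmx_load_msr() and nested_vmx_store_msr() above are guest-physical arrays of 16-byte entries (index, reserved, 64-bit value); loading walks the array in order, validates each entry, and reports failure as the 1-based index of the offending entry, which nested_vmx_run() then hands to nested_vmx_entry_failure() together with EXIT_REASON_MSR_LOAD_FAIL. A minimal sketch of that walk over an in-memory array, with an illustrative set_msr() hook standing in for kvm_set_msr():

#include <stdbool.h>
#include <stdint.h>

/* Layout of one entry in a VM-entry/VM-exit MSR area (16 bytes). */
struct msr_entry {
	uint32_t index;
	uint32_t reserved;	/* must be zero */
	uint64_t value;
};

/* Illustrative write hook; returns false if the MSR write is rejected. */
static bool set_msr(uint32_t index, uint64_t value)
{
	(void)index; (void)value;
	return true;		/* stand-in: accept everything */
}

/* Returns 0 on success, or the 1-based index of the entry that failed. */
static uint32_t load_msr_area(const struct msr_entry *area, uint32_t count)
{
	for (uint32_t i = 0; i < count; i++) {
		if (area[i].reserved != 0)
			return i + 1;
		if (!set_msr(area[i].index, area[i].value))
			return i + 1;
	}
	return 0;
}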
+ */ + vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; + vmx->nested.pi_pending = false; + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, + page_to_phys(vmx->nested.pi_desc_page) + + (unsigned long)(vmcs12->posted_intr_desc_addr & + (PAGE_SIZE - 1))); + } else + exec_control &= ~PIN_BASED_POSTED_INTR; + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); - } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) { + } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && + (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); } + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + vmcs_write64(EOI_EXIT_BITMAP0, + vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, + vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, + vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, + vmcs12->eoi_exit_bitmap3); + vmcs_write16(GUEST_INTR_STATUS, + vmcs12->guest_intr_status); + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); } + if (cpu_has_vmx_msr_bitmap() && + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) { + vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested)); + } else + exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; + /* - * Merging of IO and MSR bitmaps not currently supported. + * Merging of IO bitmap not currently supported. * Rather, exit every time. 
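The four EOI_EXIT_BITMAP fields copied from vmcs12 a few lines up together cover all 256 interrupt vectors, 64 vectors per field; during EOI virtualization the CPU consults the bit for the vector being completed and raises an EOI-induced VM exit when it is set. The lookup is just an index split, shown here as a sketch:

#include <stdbool.h>
#include <stdint.h>

/* eoi_exit_bitmap[0..3] correspond to EOI_EXIT_BITMAP0..3. */
static bool eoi_causes_exit(const uint64_t eoi_exit_bitmap[4], uint8_t vector)
{
	return (eoi_exit_bitmap[vector / 64] >> (vector % 64)) & 1;
}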
*/ - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; @@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) int cpu; struct loaded_vmcs *vmcs02; bool ia32e; + u32 msr_entry_idx; if (!nested_vmx_check_permission(vcpu) || !nested_vmx_check_vmcs12(vcpu)) @@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !PAGE_ALIGNED(vmcs12->msr_bitmap)) { + if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - /*TODO: Also verify bits beyond physical address width are 0*/ + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return 1; + } + + if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_true_procbased_ctls_low, - nested_vmx_procbased_ctls_high) || + vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || + vmx->nested.nested_vmx_pinbased_ctls_low, + vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_true_exit_ctls_low, - nested_vmx_exit_ctls_high) || + vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_true_entry_ctls_low, - nested_vmx_entry_ctls_high)) + vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || + if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); @@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - vmcs12->launch_state = 1; - prepare_vmcs02(vcpu, vmcs12); + msr_entry_idx = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (msr_entry_idx) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); + return 1; + } + + vmcs12->launch_state = 1; + if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) return 
kvm_emulate_halt(vcpu); @@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; } - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) @@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_vid(vmcs12)) + vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); + vmcs12->vm_entry_controls = (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); @@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(vcpu); + + if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, + vmcs12->vm_exit_msr_load_count)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); } /* @@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, nested_release_page(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } + if (vmx->nested.pi_desc_page) { + kunmap(vmx->nested.pi_desc_page); + nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; + vmx->nested.pi_desc = NULL; + } /* * We are now running in L2, mmu_notifier will force to reload the @@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) shrink_ple_window(vcpu); } +static void vmx_slot_enable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + kvm_mmu_slot_largepage_remove_write_access(kvm, slot); +} + +static void vmx_slot_disable_log_dirty(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_mmu_slot_set_dirty(kvm, slot); +} + +static void vmx_flush_log_dirty(struct kvm *kvm) +{ + kvm_flush_pml_buffers(kvm); +} + +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t offset, unsigned long mask) +{ + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_nested_events = vmx_check_nested_events, .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c259814..bd7a70b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +/* lapic timer 
advance (tscdeadline mode only) in nanoseconds */ +unsigned int lapic_timer_advance_ns = 0; +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); + static bool backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_window", VCPU_STAT(irq_window_exits) }, { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, +static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len, u32 access) { return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, @@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk) } #endif +void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +{ + /* + * Note: KVM_REQ_PENDING_TIMER is implicitly checked in + * vcpu_enter_guest. This function is only called from + * the physical CPU that is running vcpu. + */ + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); +} static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; +static unsigned long max_tsc_khz; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { @@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; @@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed; + && !backwards_tsc_observed + && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { u64 gpa_offset; + struct kvm_arch *ka = &vcpu->kvm->arch; + kvmclock_reset(vcpu); + if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { + bool tmp = (msr == MSR_KVM_SYSTEM_TIME); + + if (ka->boot_vcpu_runs_old_kvmclock != tmp) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); + + ka->boot_vcpu_runs_old_kvmclock = tmp; + } + vcpu->arch.time = data; kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); @@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +EXPORT_SYMBOL_GPL(kvm_get_msr); static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { @@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case 
KVM_CAP_IOAPIC_POLARITY_IGNORED: + case KVM_CAP_TSC_DEADLINE_TIMER: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; default: r = 0; break; @@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, * @kvm: kvm instance * @log: slot id and address to which we copy the log * - * We need to keep it in mind that VCPU threads can write to the bitmap - * concurrently. So, to avoid losing data, we keep the following order for - * each bit: + * Steps 1-4 below provide general overview of dirty page logging. See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. * * 1. Take a snapshot of the bit and clear it if needed. * 2. Write protect the corresponding page. - * 3. Flush TLB's if needed. - * 4. Copy the snapshot to the userspace. - * - * Between 2 and 3, the guest may write to the page using the remaining TLB - * entry. This is not a problem because the page will be reported dirty at - * step 4 using the snapshot taken before and step 3 ensures that successive - * writes will be logged for the next call. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - struct kvm_memory_slot *memslot; - unsigned long n, i; - unsigned long *dirty_bitmap; - unsigned long *dirty_bitmap_buffer; bool is_dirty = false; + int r; mutex_lock(&kvm->slots_lock); - r = -EINVAL; - if (log->slot >= KVM_USER_MEM_SLOTS) - goto out; - - memslot = id_to_memslot(kvm->memslots, log->slot); - - dirty_bitmap = memslot->dirty_bitmap; - r = -ENOENT; - if (!dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); - - dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); - memset(dirty_bitmap_buffer, 0, n); - - spin_lock(&kvm->mmu_lock); - - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; - - if (!dirty_bitmap[i]) - continue; - - is_dirty = true; - - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; - - offset = i * BITS_PER_LONG; - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); - } - - spin_unlock(&kvm->mmu_lock); + /* + * Flush potentially hardware-cached dirty pages to dirty_bitmap. + */ + if (kvm_x86_ops->flush_log_dirty) + kvm_x86_ops->flush_log_dirty(kvm); - /* See the comments in kvm_mmu_slot_remove_write_access(). */ - lockdep_assert_held(&kvm->slots_lock); + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); /* * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). 
*/ + lockdep_assert_held(&kvm->slots_lock); if (is_dirty) kvm_flush_remote_tlbs(kvm); - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) - goto out; - - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; } @@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, if (rc != X86EMUL_CONTINUE) return rc; addr += now; + if (ctxt->mode != X86EMUL_MODE_PROT64) + addr = (u32)addr; val += now; bytes -= now; } @@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon kvm_register_write(emul_to_vcpu(ctxt), reg, val); } +static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) +{ + kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = { .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .set_nmi_mask = emulator_set_nmi_mask, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* @@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } -int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - int r; struct msr_data msr; struct kvm *kvm = vcpu->kvm; - r = vcpu_load(vcpu); - if (r) - return r; + if (vcpu_load(vcpu)) + return; msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; @@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); - - return r; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return 0; } +static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Still write protect RO slot */ + if (new->flags & KVM_MEM_READONLY) { + kvm_mmu_slot_remove_write_access(kvm, new); + return; + } + + /* + * Call kvm_x86_ops dirty logging hooks when they are valid. + * + * kvm_x86_ops->slot_disable_log_dirty is called when: + * + * - KVM_MR_CREATE with dirty logging is disabled + * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag + * + * The reason is, in case of PML, we need to set D-bit for any slots + * with dirty logging disabled in order to eliminate unnecessary GPA + * logging in PML buffer (and potential PML buffer full VMEXIT). This + * guarantees leaving PML enabled during guest's lifetime won't have + * any additional overhead from PML when guest is running with dirty + * logging disabled for memory slots. + * + * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot + * to dirty logging mode. + * + * If kvm_x86_ops dirty logging hooks are invalid, use write protect. + * + * In case of write protect: + * + * Write protect all pages for dirty logging. + * + * All the sptes including the large sptes which point to this + * slot are set to readonly. We can not create any new large + * spte on this slot until the end of the logging. + * + * See the comments in fast_page_fault().
+ */ + if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (kvm_x86_ops->slot_enable_log_dirty) + kvm_x86_ops->slot_enable_log_dirty(kvm, new); + else + kvm_mmu_slot_remove_write_access(kvm, new); + } else { + if (kvm_x86_ops->slot_disable_log_dirty) + kvm_x86_ops->slot_disable_log_dirty(kvm, new); + } +} + void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, enum kvm_mr_change change) { - + struct kvm_memory_slot *new; int nr_mmu_pages = 0; if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { @@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + + /* It's OK to get 'new' slot here as it has already been installed */ + new = id_to_memslot(kvm->memslots, mem->slot); + /* - * Write protect all pages for dirty logging. + * Set up write protection and/or dirty logging for the new slot. * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. - * - * See the comments in fast_page_fault(). + * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have + * been zapped so no dirty logging staff is needed for old slot. For + * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the + * new and it's also covered when dealing with the new slot. */ - if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_slot_remove_write_access(kvm, mem->slot); + if (change != KVM_MR_DELETE) + kvm_mmu_slot_apply_flags(kvm, new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cc1d61a..f5fef18 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); @@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; +extern unsigned int lapic_timer_advance_ns; + extern struct static_key kvm_no_apic_vcpu; #endif diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index c1c1544..ac4453d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,9 @@ #include <linux/virtio_console.h> #include <linux/pm.h> #include <linux/export.h> +#include <linux/pci.h> +#include <linux/virtio_pci.h> +#include <asm/acpi.h> #include <asm/apic.h> #include <asm/lguest.h> #include <asm/paravirt.h> @@ -71,6 +74,8 @@ #include <asm/stackprotector.h> #include <asm/reboot.h> /* for struct machine_ops */ #include <asm/kvm_para.h> +#include <asm/pci_x86.h> +#include <asm/pci-direct.h> /*G:010 * Welcome to the Guest! @@ -831,6 +836,24 @@ static struct irq_chip lguest_irq_controller = { .irq_unmask = enable_lguest_irq, }; +static int lguest_enable_irq(struct pci_dev *dev) +{ + u8 line = 0; + + /* We literally use the PCI interrupt line as the irq number. 
*/ + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); + irq_set_chip_and_handler_name(line, &lguest_irq_controller, + handle_level_irq, "level"); + dev->irq = line; + return 0; +} + +/* We don't do hotplug PCI, so this shouldn't be called. */ +static void lguest_disable_irq(struct pci_dev *dev) +{ + WARN_ON(1); +} + /* * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the @@ -1181,25 +1204,136 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } +/* Offset within PCI config space of BAR access capability. */ +static int console_cfg_offset = 0; +static int console_access_cap; + +/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ +static void set_cfg_window(u32 cfg_offset, u32 off) +{ + write_pci_config_byte(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, bar), + 0); + write_pci_config(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, length), + 4); + write_pci_config(0, 1, 0, + cfg_offset + offsetof(struct virtio_pci_cap, offset), + off); +} + +static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) +{ + /* + * We could set this up once, then leave it; nothing else in the * + * kernel should touch these registers. But if it went wrong, that + * would be a horrible bug to find. + */ + set_cfg_window(cfg_offset, off); + write_pci_config(0, 1, 0, + cfg_offset + sizeof(struct virtio_pci_cap), val); +} + +static void probe_pci_console(void) +{ + u8 cap, common_cap = 0, device_cap = 0; + /* Offset within BAR0 */ + u32 device_offset; + u32 device_len; + + /* Avoid recursive printk into here. */ + console_cfg_offset = -1; + + if (!early_pci_allowed()) { + printk(KERN_ERR "lguest: early PCI access not allowed!\n"); + return; + } + + /* We expect a console PCI device at BUS0, slot 1. */ + if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { + printk(KERN_ERR "lguest: PCI device is %#x!\n", + read_pci_config(0, 1, 0, 0)); + return; + } + + /* Find the capabilities we need (must be in bar0) */ + cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); + while (cap) { + u8 vndr = read_pci_config_byte(0, 1, 0, cap); + if (vndr == PCI_CAP_ID_VNDR) { + u8 type, bar; + u32 offset, length; + + type = read_pci_config_byte(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, cfg_type)); + bar = read_pci_config_byte(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, bar)); + offset = read_pci_config(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, offset)); + length = read_pci_config(0, 1, 0, + cap + offsetof(struct virtio_pci_cap, length)); + + switch (type) { + case VIRTIO_PCI_CAP_DEVICE_CFG: + if (bar == 0) { + device_cap = cap; + device_offset = offset; + device_len = length; + } + break; + case VIRTIO_PCI_CAP_PCI_CFG: + console_access_cap = cap; + break; + } + } + cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); + } + if (!device_cap || !console_access_cap) { + printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", + common_cap, device_cap, console_access_cap); + return; + } + + /* + * Note that we can't check features, until we've set the DRIVER + * status bit. We don't want to do that until we have a real driver, + * so we just check that the device-specific config has room for + * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE + * it should ignore the access. 
+ */ + if (device_len < (offsetof(struct virtio_console_config, emerg_wr) + + sizeof(u32))) { + printk(KERN_ERR "lguest: console missing emerg_wr field\n"); + return; + } + + console_cfg_offset = device_offset; + printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); +} + /* * We will eventually use the virtio console device to produce console output, - * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. + * but before that is set up we use the virtio PCI console's backdoor mmio + * access and the "emergency" write facility (which is legal even before the + * device is configured). */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { - char scratch[17]; - unsigned int len = count; + /* If we couldn't find PCI console, forget it. */ + if (console_cfg_offset < 0) + return count; - /* We use a nul-terminated string, so we make a copy. Icky, huh? */ - if (len > sizeof(scratch) - 1) - len = sizeof(scratch) - 1; - scratch[len] = '\0'; - memcpy(scratch, buf, len); - hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0); + if (unlikely(!console_cfg_offset)) { + probe_pci_console(); + if (console_cfg_offset < 0) + return count; + } - /* This routine returns the number of bytes actually written. */ - return len; + write_bar_via_cfg(console_access_cap, + console_cfg_offset + + offsetof(struct virtio_console_config, emerg_wr), + buf[0]); + return 1; } /* @@ -1400,14 +1534,6 @@ __init void lguest_init(void) atomic_notifier_chain_register(&panic_notifier_list, &paniced); /* - * The IDE code spends about 3 seconds probing for disks: if we reserve - * all the I/O ports up front it can't get them and so doesn't probe. - * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. - */ - paravirt_disable_iospace(); - - /* * This is messy CPU setup stuff which the native boot code does before * start_kernel, so we have to do, too: */ @@ -1436,6 +1562,13 @@ __init void lguest_init(void) /* Register our very early console. */ virtio_cons_early_init(early_put_chars); + /* Don't let ACPI try to control our PCI interrupts. */ + disable_acpi(); + + /* We control them ourselves, by overriding these two hooks. */ + pcibios_enable_irq = lguest_enable_irq; + pcibios_disable_irq = lguest_disable_irq; + /* * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 56313a3..89b53c9 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -53,6 +53,8 @@ .Lmemcpy_e_e: .previous +.weak memcpy + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -199,8 +201,8 @@ ENDPROC(__memcpy) * only outcome... 
*/ .section .altinstructions, "a" - altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c - altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 65268a6..9c4b530 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -24,7 +24,10 @@ * Output: * rax: dest */ +.weak memmove + ENTRY(memmove) +ENTRY(__memmove) CFI_STARTPROC /* Handle more 32 bytes in loop */ @@ -220,4 +223,5 @@ ENTRY(memmove) .Lmemmove_end_forward-.Lmemmove_begin_forward, \ .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs .previous +ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2dcb380..6f44935 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -56,6 +56,8 @@ .Lmemset_e_e: .previous +.weak memset + ENTRY(memset) ENTRY(__memset) CFI_STARTPROC @@ -147,8 +149,8 @@ ENDPROC(__memset) * feature to implement the right patch order. */ .section .altinstructions,"a" - altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-memset,.Lmemset_e-.Lmemset_c - altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e + altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-__memset,.Lmemset_e-.Lmemset_c + altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e .previous diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index ecfdc46..c4cc740 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,6 +20,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o obj-$(CONFIG_KMEMCHECK) += kmemcheck/ +KASAN_SANITIZE_kasan_init_$(BITS).o := n +obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e3ff27a..ede025f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(nx_warning, from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && (pgd_flags(*pgd) & _PAGE_USER) && - (read_cr4() & X86_CR4_SMEP)) + (__read_cr4() & X86_CR4_SMEP)) printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d754782..81bf3d2 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_numa(pte)) { + if (pte_protnone(pte)) { pte_unmap(ptep); return 0; } @@ -172,13 +172,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, */ if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; - if (unlikely(pmd_large(pmd))) { + if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they * can be serialised against THP migration. 
*/ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; @@ -388,10 +388,9 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8b977eb..42982b2 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -52,23 +52,17 @@ int pud_huge(pud_t pud) return 0; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} #else -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - +/* + * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ int pmd_huge(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_PSE); + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } int pud_huge(pud_t pud) @@ -178,4 +172,15 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_CMA +static __init int gigantic_pages_init(void) +{ + /* With CMA we can allocate gigantic pages at runtime */ + if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif #endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6..a110efc 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -173,11 +173,11 @@ static void __init probe_page_size_mask(void) /* Enable PSE if available */ if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); + cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } } @@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr, } } +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<<PG_LEVEL_1G)) + return str_1g; + /* + * 32-bit without PAE has a 4M large page size. + * PG_LEVEL_2M is misnamed, but we can at least + * print out the right size in the string. 
+ */ + if (IS_ENABLED(CONFIG_X86_32) && + !IS_ENABLED(CONFIG_X86_PAE) && + mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_4m; + + if (mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_2m; + + return str_4k; +} + static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) @@ -333,8 +358,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, - (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( - (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); + page_size_string(&mr[i])); return nr_range; } @@ -608,7 +632,7 @@ void __init init_mem_mapping(void) * * * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. + * contains BIOS code and data regions used by X and dosemu and similar apps. * Access has to be given to non-kernel-ram areas as well, these contain the PCI * mmio resources as well as potential bios/acpi data regions. */ @@ -713,6 +737,15 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +#ifdef CONFIG_SMP + .active_mm = &init_mm, + .state = 0, +#endif + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ +}; +EXPORT_SYMBOL_GPL(cpu_tlbstate); + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c new file mode 100644 index 0000000..4860906 --- /dev/null +++ b/arch/x86/mm/kasan_init_64.c @@ -0,0 +1,206 @@ +#include <linux/bootmem.h> +#include <linux/kasan.h> +#include <linux/kdebug.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> + +#include <asm/tlbflush.h> +#include <asm/sections.h> + +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern struct range pfn_mapped[E820_X_MAX]; + +extern unsigned char kasan_zero_page[PAGE_SIZE]; + +static int __init map_range(struct range *range) +{ + unsigned long start; + unsigned long end; + + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + + /* + * end + 1 here is intentional. We check several shadow bytes in advance + * to slightly speed up fastpath. In some rare cases we could cross + * boundary of mapped shadow, so we just map some more here. 
+ */ + return vmemmap_populate(start, end + 1, NUMA_NO_NODE); +} + +static void __init clear_pgds(unsigned long start, + unsigned long end) +{ + for (; start < end; start += PGDIR_SIZE) + pgd_clear(pgd_offset_k(start)); +} + +void __init kasan_map_early_shadow(pgd_t *pgd) +{ + int i; + unsigned long start = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + + for (i = pgd_index(start); start < end; i++) { + pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) + | _KERNPG_TABLE); + start += PGDIR_SIZE; + } +} + +static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + while (addr + PAGE_SIZE <= end) { + WARN_ON(!pte_none(*pte)); + set_pte(pte, __pte(__pa_nodebug(kasan_zero_page) + | __PAGE_KERNEL_RO)); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } + return 0; +} + +static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + int ret = 0; + pmd_t *pmd = pmd_offset(pud, addr); + + while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { + WARN_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) + | __PAGE_KERNEL_RO)); + addr += PMD_SIZE; + pmd = pmd_offset(pud, addr); + } + if (addr < end) { + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pte_populate(pmd, addr, end); + } + return ret; +} + + +static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + int ret = 0; + pud_t *pud = pud_offset(pgd, addr); + + while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { + WARN_ON(!pud_none(*pud)); + set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) + | __PAGE_KERNEL_RO)); + addr += PUD_SIZE; + pud = pud_offset(pgd, addr); + } + + if (addr < end) { + if (pud_none(*pud)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pmd_populate(pud, addr, end); + } + return ret; +} + +static int __init zero_pgd_populate(unsigned long addr, unsigned long end) +{ + int ret = 0; + pgd_t *pgd = pgd_offset_k(addr); + + while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { + WARN_ON(!pgd_none(*pgd)); + set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) + | __PAGE_KERNEL_RO)); + addr += PGDIR_SIZE; + pgd = pgd_offset_k(addr); + } + + if (addr < end) { + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pud_populate(pgd, addr, end); + } + return ret; +} + + +static void __init populate_zero_shadow(const void *start, const void *end) +{ + if (zero_pgd_populate((unsigned long)start, (unsigned long)end)) + panic("kasan: unable to map zero shadow!"); +} + + +#ifdef CONFIG_KASAN_INLINE +static int kasan_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (val == DIE_GPF) { + pr_emerg("CONFIG_KASAN_INLINE enabled"); + pr_emerg("GPF could be caused by NULL-ptr deref or user memory access"); + } + return NOTIFY_OK; +} + +static struct notifier_block kasan_die_notifier = { + .notifier_call = kasan_die_handler, +}; +#endif + +void __init kasan_init(void) +{ + int i; + +#ifdef CONFIG_KASAN_INLINE + register_die_notifier(&kasan_die_notifier); +#endif + + memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); + 
load_cr3(early_level4_pgt); + + clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + + populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_X_MAX; i++) { + if (pfn_mapped[i].end == 0) + break; + + if (map_range(&pfn_mapped[i])) + panic("kasan: unable to allocate shadow!"); + } + populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + NUMA_NO_NODE); + + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); + + memset(kasan_zero_page, 0, PAGE_SIZE); + + load_cr3(init_level4_pgt); + init_task.kasan_depth = 0; +} diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 919b912..df4552b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -35,12 +35,12 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned int stack_maxrandom_size(void) +static unsigned long stack_maxrandom_size(void) { - unsigned int max = 0; + unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; + max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; } return max; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1a88370..cd4785b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -794,7 +794,6 @@ int early_cpu_to_node(int cpu) void debug_cpumask_set_cpu(int cpu, int node, bool enable) { struct cpumask *mask; - char buf[64]; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ @@ -812,10 +811,9 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) else cpumask_clear_cpu(cpu, mask); - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", enable ? 
"numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); + cpu, node, cpumask_pr_args(mask)); return; } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d223e1f..5a7e525 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -358,7 +362,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -379,7 +383,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: _pgd_free(pgd); out: diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ee61c36..3250f23 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,9 +14,6 @@ #include <asm/uv/uv.h> #include <linux/debugfs.h> -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) - = { &init_mm, 0, }; - /* * Smarter SMP flushing macros. * c/o Linus Torvalds. 
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index cfd1b13..6ac2738 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -10,9 +10,6 @@ struct pci_root_info { struct acpi_device *bridge; char name[16]; - unsigned int res_num; - struct resource *res; - resource_size_t *res_offset; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; @@ -218,130 +215,41 @@ static void teardown_mcfg_map(struct pci_root_info *info) } #endif -static acpi_status resource_to_addr(struct acpi_resource *resource, - struct acpi_resource_address64 *addr) -{ - acpi_status status; - struct acpi_resource_memory24 *memory24; - struct acpi_resource_memory32 *memory32; - struct acpi_resource_fixed_memory32 *fixed_memory32; - - memset(addr, 0, sizeof(*addr)); - switch (resource->type) { - case ACPI_RESOURCE_TYPE_MEMORY24: - memory24 = &resource->data.memory24; - addr->resource_type = ACPI_MEMORY_RANGE; - addr->minimum = memory24->minimum; - addr->address_length = memory24->address_length; - addr->maximum = addr->minimum + addr->address_length - 1; - return AE_OK; - case ACPI_RESOURCE_TYPE_MEMORY32: - memory32 = &resource->data.memory32; - addr->resource_type = ACPI_MEMORY_RANGE; - addr->minimum = memory32->minimum; - addr->address_length = memory32->address_length; - addr->maximum = addr->minimum + addr->address_length - 1; - return AE_OK; - case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: - fixed_memory32 = &resource->data.fixed_memory32; - addr->resource_type = ACPI_MEMORY_RANGE; - addr->minimum = fixed_memory32->address; - addr->address_length = fixed_memory32->address_length; - addr->maximum = addr->minimum + addr->address_length - 1; - return AE_OK; - case ACPI_RESOURCE_TYPE_ADDRESS16: - case ACPI_RESOURCE_TYPE_ADDRESS32: - case ACPI_RESOURCE_TYPE_ADDRESS64: - status = acpi_resource_to_address64(resource, addr); - if (ACPI_SUCCESS(status) && - (addr->resource_type == ACPI_MEMORY_RANGE || - addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0) { - return AE_OK; - } - break; - } - return AE_ERROR; -} - -static acpi_status count_resource(struct acpi_resource *acpi_res, void *data) +static void validate_resources(struct device *dev, struct list_head *crs_res, + unsigned long type) { - struct pci_root_info *info = data; - struct acpi_resource_address64 addr; - acpi_status status; - - status = resource_to_addr(acpi_res, &addr); - if (ACPI_SUCCESS(status)) - info->res_num++; - return AE_OK; -} - -static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) -{ - struct pci_root_info *info = data; - struct resource *res; - struct acpi_resource_address64 addr; - acpi_status status; - unsigned long flags; - u64 start, orig_end, end; - - status = resource_to_addr(acpi_res, &addr); - if (!ACPI_SUCCESS(status)) - return AE_OK; - - if (addr.resource_type == ACPI_MEMORY_RANGE) { - flags = IORESOURCE_MEM; - if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) - flags |= IORESOURCE_PREFETCH; - } else if (addr.resource_type == ACPI_IO_RANGE) { - flags = IORESOURCE_IO; - } else - return AE_OK; - - start = addr.minimum + addr.translation_offset; - orig_end = end = addr.maximum + addr.translation_offset; - - /* Exclude non-addressable range or non-addressable portion of range */ - end = min(end, (u64)iomem_resource.end); - if (end <= start) { - dev_info(&info->bridge->dev, - "host bridge window [%#llx-%#llx] " - "(ignored, not CPU addressable)\n", start, orig_end); - return AE_OK; - } else if (orig_end != end) { - dev_info(&info->bridge->dev, - "host bridge window [%#llx-%#llx] " - 
"([%#llx-%#llx] ignored, not CPU addressable)\n", - start, orig_end, end + 1, orig_end); - } + LIST_HEAD(list); + struct resource *res1, *res2, *root = NULL; + struct resource_entry *tmp, *entry, *entry2; - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - info->res_offset[info->res_num] = addr.translation_offset; - info->res_num++; + BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0); + root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource; - if (!pci_use_crs) - dev_printk(KERN_DEBUG, &info->bridge->dev, - "host bridge window %pR (ignored)\n", res); + list_splice_init(crs_res, &list); + resource_list_for_each_entry_safe(entry, tmp, &list) { + bool free = false; + resource_size_t end; - return AE_OK; -} - -static void coalesce_windows(struct pci_root_info *info, unsigned long type) -{ - int i, j; - struct resource *res1, *res2; - - for (i = 0; i < info->res_num; i++) { - res1 = &info->res[i]; + res1 = entry->res; if (!(res1->flags & type)) - continue; + goto next; + + /* Exclude non-addressable range or non-addressable portion */ + end = min(res1->end, root->end); + if (end <= res1->start) { + dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n", + res1); + free = true; + goto next; + } else if (res1->end != end) { + dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n", + res1, (unsigned long long)end + 1, + (unsigned long long)res1->end); + res1->end = end; + } - for (j = i + 1; j < info->res_num; j++) { - res2 = &info->res[j]; + resource_list_for_each_entry(entry2, crs_res) { + res2 = entry2->res; if (!(res2->flags & type)) continue; @@ -353,118 +261,92 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type) if (resource_overlaps(res1, res2)) { res2->start = min(res1->start, res2->start); res2->end = max(res1->end, res2->end); - dev_info(&info->bridge->dev, - "host bridge window expanded to %pR; %pR ignored\n", + dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n", res2, res1); - res1->flags = 0; + free = true; + goto next; } } + +next: + resource_list_del(entry); + if (free) + resource_list_free_entry(entry); + else + resource_list_add_tail(entry, crs_res); } } static void add_resources(struct pci_root_info *info, - struct list_head *resources) + struct list_head *resources, + struct list_head *crs_res) { - int i; - struct resource *res, *root, *conflict; - - coalesce_windows(info, IORESOURCE_MEM); - coalesce_windows(info, IORESOURCE_IO); + struct resource_entry *entry, *tmp; + struct resource *res, *conflict, *root = NULL; - for (i = 0; i < info->res_num; i++) { - res = &info->res[i]; + validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM); + validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO); + resource_list_for_each_entry_safe(entry, tmp, crs_res) { + res = entry->res; if (res->flags & IORESOURCE_MEM) root = &iomem_resource; else if (res->flags & IORESOURCE_IO) root = &ioport_resource; else - continue; + BUG_ON(res); conflict = insert_resource_conflict(root, res); - if (conflict) + if (conflict) { dev_info(&info->bridge->dev, "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); - else - pci_add_resource_offset(resources, res, - info->res_offset[i]); + resource_list_destroy_entry(entry); + } } -} -static void free_pci_root_info_res(struct pci_root_info *info) -{ - kfree(info->res); - info->res = NULL; - kfree(info->res_offset); - info->res_offset 
= NULL; - info->res_num = 0; + list_splice_tail(crs_res, resources); } -static void __release_pci_root_info(struct pci_root_info *info) +static void release_pci_root_info(struct pci_host_bridge *bridge) { - int i; struct resource *res; + struct resource_entry *entry; + struct pci_root_info *info = bridge->release_data; - for (i = 0; i < info->res_num; i++) { - res = &info->res[i]; - - if (!res->parent) - continue; - - if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) - continue; - - release_resource(res); + resource_list_for_each_entry(entry, &bridge->windows) { + res = entry->res; + if (res->parent && + (res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) + release_resource(res); } - free_pci_root_info_res(info); - teardown_mcfg_map(info); - kfree(info); } -static void release_pci_root_info(struct pci_host_bridge *bridge) -{ - struct pci_root_info *info = bridge->release_data; - - __release_pci_root_info(info); -} - static void probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, - int busnum, int domain) + int busnum, int domain, + struct list_head *list) { - size_t size; + int ret; + struct resource_entry *entry; sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); info->bridge = device; - - info->res_num = 0; - acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, - info); - if (!info->res_num) - return; - - size = sizeof(*info->res) * info->res_num; - info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node); - if (!info->res) { - info->res_num = 0; - return; - } - - size = sizeof(*info->res_offset) * info->res_num; - info->res_num = 0; - info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node); - if (!info->res_offset) { - kfree(info->res); - info->res = NULL; - return; - } - - acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, - info); + ret = acpi_dev_get_resources(device, list, + acpi_dev_filter_resource_type_cb, + (void *)(IORESOURCE_IO | IORESOURCE_MEM)); + if (ret < 0) + dev_warn(&device->dev, + "failed to parse _CRS method, error code %d\n", ret); + else if (ret == 0) + dev_dbg(&device->dev, + "no IO and memory resources present in _CRS\n"); + else + resource_list_for_each_entry(entry, list) + entry->res->name = info->name; } struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) @@ -473,6 +355,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; + struct resource_entry *res_entry; + LIST_HEAD(crs_res); LIST_HEAD(resources); struct pci_bus *bus; struct pci_sysdata *sd; @@ -520,18 +404,22 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(info); } else { - probe_pci_root_info(info, device, busnum, domain); - /* insert busn res at first */ pci_add_resource(&resources, &root->secondary); + /* * _CRS with no apertures is normal, so only fall back to * defaults or native bridge info if we're ignoring _CRS. 
*/ - if (pci_use_crs) - add_resources(info, &resources); - else { - free_pci_root_info_res(info); + probe_pci_root_info(info, device, busnum, domain, &crs_res); + if (pci_use_crs) { + add_resources(info, &resources, &crs_res); + } else { + resource_list_for_each_entry(res_entry, &crs_res) + dev_printk(KERN_DEBUG, &device->dev, + "host bridge window %pR (ignored)\n", + res_entry->res); + resource_list_free(&crs_res); x86_pci_root_bus_resources(busnum, &resources); } @@ -546,8 +434,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) to_pci_host_bridge(bus->bridge), release_pci_root_info, info); } else { - pci_free_resource_list(&resources); - __release_pci_root_info(info); + resource_list_free(&resources); + teardown_mcfg_map(info); + kfree(info); } } diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index f3a2cfc..7bcf06a 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -31,7 +31,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) { struct pci_root_info *info = x86_find_pci_root_info(bus); struct pci_root_res *root_res; - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; if (!info) @@ -41,7 +41,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) bus); /* already added by acpi ? */ - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2fb3847..3d2612b 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -513,6 +513,31 @@ void __init pcibios_set_cache_line_size(void) } } +/* + * Some device drivers assume dev->irq won't change after calling + * pci_disable_device(). So delay releasing of IRQ resource to driver + * unbinding time. Otherwise it will break PM subsystem and drivers + * like xen-pciback etc. 
+ */ +static int pci_irq_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct pci_dev *dev = to_pci_dev(data); + + if (action != BUS_NOTIFY_UNBOUND_DRIVER) + return NOTIFY_DONE; + + if (pcibios_disable_irq) + pcibios_disable_irq(dev); + + return NOTIFY_OK; +} + +static struct notifier_block pci_irq_nb = { + .notifier_call = pci_irq_notifier, + .priority = INT_MIN, +}; + int __init pcibios_init(void) { if (!raw_pci_ops) { @@ -525,6 +550,9 @@ int __init pcibios_init(void) if (pci_bf_sort >= pci_force_bf) pci_sort_breadthfirst(); + + bus_register_notifier(&pci_bus_type, &pci_irq_nb); + return 0; } @@ -683,12 +711,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return 0; } -void pcibios_disable_device (struct pci_dev *dev) -{ - if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) - pcibios_disable_irq(dev); -} - int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 852aa4c..efb8493 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && - dev->irq > 0) { + if (dev->irq_managed && dev->irq > 0) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; + dev->irq = 0; } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5dc6ca5..e71b3db 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1256,22 +1256,9 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } -bool mp_should_keep_irq(struct device *dev) -{ - if (dev->power.is_prepared) - return true; -#ifdef CONFIG_PM - if (dev->power.runtime_status == RPM_SUSPENDING) - return true; -#endif - - return false; -} - static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && - dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) { mp_unmap_irq(dev->irq); dev->irq = 0; dev->irq_managed = 0; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 676e5e0..dd30b7e 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -397,12 +397,12 @@ static acpi_status check_mcfg_resource(struct acpi_resource *res, void *data) status = acpi_resource_to_address64(res, &address); if (ACPI_FAILURE(status) || - (address.address_length <= 0) || + (address.address.address_length <= 0) || (address.resource_type != ACPI_MEMORY_RANGE)) return AE_OK; - if ((mcfg_res->start >= address.minimum) && - (mcfg_res->end < (address.minimum + address.address_length))) { + if ((mcfg_res->start >= address.address.minimum) && + (mcfg_res->end < (address.address.minimum + address.address.address_length))) { mcfg_res->flags = 1; return AE_CTRL_TERMINATE; } diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 9098d88..d22f4b5 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -298,12 +298,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) map_irq.entry_nr = nvec; } else if (type == PCI_CAP_ID_MSIX) { int pos; + unsigned long flags; u32 table_offset, bir; pos = dev->msix_cap; pci_read_config_dword(dev, pos + PCI_MSIX_TABLE, &table_offset); bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR); + flags = pci_resource_flags(dev, bir); + if (!flags || (flags & IORESOURCE_UNSET)) + return -EINVAL; map_irq.table_base = pci_resource_start(dev, bir); 
map_irq.entry_nr = msidesc->msi_attrib.entry_nr; diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 85afde1..a62e0be 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -5,6 +5,7 @@ obj-y += geode/ obj-y += goldfish/ obj-y += iris/ obj-y += intel-mid/ +obj-y += intel-quark/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 5fcda72..86d0f9e 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -91,167 +91,6 @@ ENTRY(efi_call) ret ENDPROC(efi_call) -#ifdef CONFIG_EFI_MIXED - -/* - * We run this function from the 1:1 mapping. - * - * This function must be invoked with a 1:1 mapped stack. - */ -ENTRY(__efi64_thunk) - movl %ds, %eax - push %rax - movl %es, %eax - push %rax - movl %ss, %eax - push %rax - - subq $32, %rsp - movl %esi, 0x0(%rsp) - movl %edx, 0x4(%rsp) - movl %ecx, 0x8(%rsp) - movq %r8, %rsi - movl %esi, 0xc(%rsp) - movq %r9, %rsi - movl %esi, 0x10(%rsp) - - sgdt save_gdt(%rip) - - leaq 1f(%rip), %rbx - movq %rbx, func_rt_ptr(%rip) - - /* Switch to gdt with 32-bit segments */ - movl 64(%rsp), %eax - lgdt (%rax) - - leaq efi_enter32(%rip), %rax - pushq $__KERNEL_CS - pushq %rax - lretq - -1: addq $32, %rsp - - lgdt save_gdt(%rip) - - pop %rbx - movl %ebx, %ss - pop %rbx - movl %ebx, %es - pop %rbx - movl %ebx, %ds - - /* - * Convert 32-bit status code into 64-bit. - */ - test %rax, %rax - jz 1f - movl %eax, %ecx - andl $0x0fffffff, %ecx - andl $0xf0000000, %eax - shl $32, %rax - or %rcx, %rax -1: - ret -ENDPROC(__efi64_thunk) - -ENTRY(efi_exit32) - movq func_rt_ptr(%rip), %rax - push %rax - mov %rdi, %rax - ret -ENDPROC(efi_exit32) - - .code32 -/* - * EFI service pointer must be in %edi. - * - * The stack should represent the 32-bit calling convention. - */ -ENTRY(efi_enter32) - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss - - /* Reload pgtables */ - movl %cr3, %eax - movl %eax, %cr3 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Disable long mode via EFER */ - movl $MSR_EFER, %ecx - rdmsr - btrl $_EFER_LME, %eax - wrmsr - - call *%edi - - /* We must preserve return value */ - movl %eax, %edi - - /* - * Some firmware will return with interrupts enabled. Be sure to - * disable them before we switch GDTs. 
- */ - cli - - movl 68(%esp), %eax - movl %eax, 2(%eax) - lgdtl (%eax) - - movl %cr4, %eax - btsl $(X86_CR4_PAE_BIT), %eax - movl %eax, %cr4 - - movl %cr3, %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - xorl %eax, %eax - lldt %ax - - movl 72(%esp), %eax - pushl $__KERNEL_CS - pushl %eax - - /* Enable paging */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - lret -ENDPROC(efi_enter32) - - .data - .balign 8 - .global efi32_boot_gdt -efi32_boot_gdt: .word 0 - .quad 0 - -save_gdt: .word 0 - .quad 0 -func_rt_ptr: .quad 0 - - .global efi_gdt64 -efi_gdt64: - .word efi_gdt64_end - efi_gdt64 - .long 0 /* Filled out by user */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -efi_gdt64_end: -#endif /* CONFIG_EFI_MIXED */ - .data ENTRY(efi_scratch) .fill 3,8,0 diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index 8806fa7..ff85d28 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -1,9 +1,26 @@ /* * Copyright (C) 2014 Intel Corporation; author Matt Fleming + * + * Support for invoking 32-bit EFI runtime services from a 64-bit + * kernel. + * + * The below thunking functions are only used after ExitBootServices() + * has been called. This simplifies things considerably as compared with + * the early EFI thunking because we can leave all the kernel state + * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime + * services from __KERNEL32_CS. This means we can continue to service + * interrupts across an EFI mixed mode call. + * + * We do however, need to handle the fact that we're running in a full + * 64-bit virtual address space. Things like the stack and instruction + * addresses need to be accessible by the 32-bit firmware, so we rely on + * using the identity mappings in the EFI page table to access the stack + * and kernel text (see efi_setup_page_tables()). */ #include <linux/linkage.h> #include <asm/page_types.h> +#include <asm/segment.h> .text .code64 @@ -33,14 +50,6 @@ ENTRY(efi64_thunk) leaq efi_exit32(%rip), %rbx subq %rax, %rbx movl %ebx, 8(%rsp) - leaq efi_gdt64(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, 4(%rsp) - leaq efi_gdt32(%rip), %rbx - subq %rax, %rbx - movl %ebx, 2(%ebx) - movl %ebx, (%rsp) leaq __efi64_thunk(%rip), %rbx subq %rax, %rbx @@ -52,14 +61,92 @@ ENTRY(efi64_thunk) retq ENDPROC(efi64_thunk) - .data -efi_gdt32: - .word efi_gdt32_end - efi_gdt32 - .long 0 /* Filled out above */ - .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x00cf9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf93000000ffff /* __KERNEL_DS */ -efi_gdt32_end: +/* + * We run this function from the 1:1 mapping. + * + * This function must be invoked with a 1:1 mapped stack. 
+ */ +ENTRY(__efi64_thunk) + movl %ds, %eax + push %rax + movl %es, %eax + push %rax + movl %ss, %eax + push %rax + + subq $32, %rsp + movl %esi, 0x0(%rsp) + movl %edx, 0x4(%rsp) + movl %ecx, 0x8(%rsp) + movq %r8, %rsi + movl %esi, 0xc(%rsp) + movq %r9, %rsi + movl %esi, 0x10(%rsp) + + leaq 1f(%rip), %rbx + movq %rbx, func_rt_ptr(%rip) + + /* Switch to 32-bit descriptor */ + pushq $__KERNEL32_CS + leaq efi_enter32(%rip), %rax + pushq %rax + lretq + +1: addq $32, %rsp + + pop %rbx + movl %ebx, %ss + pop %rbx + movl %ebx, %es + pop %rbx + movl %ebx, %ds + /* + * Convert 32-bit status code into 64-bit. + */ + test %rax, %rax + jz 1f + movl %eax, %ecx + andl $0x0fffffff, %ecx + andl $0xf0000000, %eax + shl $32, %rax + or %rcx, %rax +1: + ret +ENDPROC(__efi64_thunk) + +ENTRY(efi_exit32) + movq func_rt_ptr(%rip), %rax + push %rax + mov %rdi, %rax + ret +ENDPROC(efi_exit32) + + .code32 +/* + * EFI service pointer must be in %edi. + * + * The stack should represent the 32-bit calling convention. + */ +ENTRY(efi_enter32) + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + movl 72(%esp), %eax + pushl $__KERNEL_CS + pushl %eax + + lret +ENDPROC(efi_enter32) + + .data + .balign 8 +func_rt_ptr: .quad 0 efi_saved_sp: .quad 0 diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index af9307f..91ec9f8 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -16,8 +16,6 @@ obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o -# SPI Devices -obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o # MISC Devices obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c deleted file mode 100644 index afd1df9..0000000 --- a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * platform_max3111.c: max3111 platform data initilization file - * - * (C) Copyright 2013 Intel Corporation - * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ - -#include <linux/gpio.h> -#include <linux/spi/spi.h> -#include <asm/intel-mid.h> - -static void __init *max3111_platform_data(void *info) -{ - struct spi_board_info *spi_info = info; - int intr = get_gpio_by_name("max3111_int"); - - spi_info->mode = SPI_MODE_0; - if (intr == -1) - return NULL; - spi_info->irq = intr + INTEL_MID_IRQ_OFFSET; - return NULL; -} - -static const struct devs_id max3111_dev_id __initconst = { - .name = "spi_max3111", - .type = SFI_DEV_TYPE_SPI, - .get_platform_data = &max3111_platform_data, -}; - -sfi_device(max3111_dev_id); diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c index e0bd082..4e72082 100644 --- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c +++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c @@ -10,15 +10,13 @@ */ /* - * This file implements two early consoles named mrst and hsu. - * mrst is based on Maxim3110 spi-uart device, it exists in both - * Moorestown and Medfield platforms, while hsu is based on a High - * Speed UART device which only exists in the Medfield platform + * This file implements early console named hsu. + * hsu is based on a High Speed UART device which only exists in the Medfield + * platform */ #include <linux/serial_reg.h> #include <linux/serial_mfd.h> -#include <linux/kmsg_dump.h> #include <linux/console.h> #include <linux/kernel.h> #include <linux/delay.h> @@ -28,216 +26,6 @@ #include <asm/pgtable.h> #include <asm/intel-mid.h> -#define MRST_SPI_TIMEOUT 0x200000 -#define MRST_REGBASE_SPI0 0xff128000 -#define MRST_REGBASE_SPI1 0xff128400 -#define MRST_CLK_SPI0_REG 0xff11d86c - -/* Bit fields in CTRLR0 */ -#define SPI_DFS_OFFSET 0 - -#define SPI_FRF_OFFSET 4 -#define SPI_FRF_SPI 0x0 -#define SPI_FRF_SSP 0x1 -#define SPI_FRF_MICROWIRE 0x2 -#define SPI_FRF_RESV 0x3 - -#define SPI_MODE_OFFSET 6 -#define SPI_SCPH_OFFSET 6 -#define SPI_SCOL_OFFSET 7 -#define SPI_TMOD_OFFSET 8 -#define SPI_TMOD_TR 0x0 /* xmit & recv */ -#define SPI_TMOD_TO 0x1 /* xmit only */ -#define SPI_TMOD_RO 0x2 /* recv only */ -#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ - -#define SPI_SLVOE_OFFSET 10 -#define SPI_SRL_OFFSET 11 -#define SPI_CFS_OFFSET 12 - -/* Bit fields in SR, 7 bits */ -#define SR_MASK 0x7f /* cover 7 bits */ -#define SR_BUSY (1 << 0) -#define SR_TF_NOT_FULL (1 << 1) -#define SR_TF_EMPT (1 << 2) -#define SR_RF_NOT_EMPT (1 << 3) -#define SR_RF_FULL (1 << 4) -#define SR_TX_ERR (1 << 5) -#define SR_DCOL (1 << 6) - -struct dw_spi_reg { - u32 ctrl0; - u32 ctrl1; - u32 ssienr; - u32 mwcr; - u32 ser; - u32 baudr; - u32 txfltr; - u32 rxfltr; - u32 txflr; - u32 rxflr; - u32 sr; - u32 imr; - u32 isr; - u32 risr; - u32 txoicr; - u32 rxoicr; - u32 rxuicr; - u32 msticr; - u32 icr; - u32 dmacr; - u32 dmatdlr; - u32 dmardlr; - u32 idr; - u32 version; - - /* Currently operates as 32 bits, though only the low 16 bits matter */ - u32 dr; -} __packed; - -#define dw_readl(dw, name) __raw_readl(&(dw)->name) -#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) - -/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ -static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; - -static u32 *pclk_spi0; -/* Always contains an accessible address, start with 0 */ -static struct dw_spi_reg *pspi; - -static struct kmsg_dumper dw_dumper; -static int dumper_registered; - -static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) -{ - static char line[1024]; - size_t len; - - /* When run to this, we'd better 
re-init the HW */ - mrst_early_console_init(); - - while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) - early_mrst_console.write(&early_mrst_console, line, len); -} - -/* Set the ratio rate to 115200, 8n1, IRQ disabled */ -static void max3110_write_config(void) -{ - u16 config; - - config = 0xc001; - dw_writel(pspi, dr, config); -} - -/* Translate char to a eligible word and send to max3110 */ -static void max3110_write_data(char c) -{ - u16 data; - - data = 0x8000 | c; - dw_writel(pspi, dr, data); -} - -void mrst_early_console_init(void) -{ - u32 ctrlr0 = 0; - u32 spi0_cdiv; - u32 freq; /* Freqency info only need be searched once */ - - /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ - pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - MRST_CLK_SPI0_REG); - spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; - freq = 100000000 / (spi0_cdiv + 1); - - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL) - mrst_spi_paddr = MRST_REGBASE_SPI1; - - pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - mrst_spi_paddr); - - /* Disable SPI controller */ - dw_writel(pspi, ssienr, 0); - - /* Set control param, 8 bits, transmit only mode */ - ctrlr0 = dw_readl(pspi, ctrl0); - - ctrlr0 &= 0xfcc0; - ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) - | (SPI_TMOD_TO << SPI_TMOD_OFFSET); - dw_writel(pspi, ctrl0, ctrlr0); - - /* - * Change the spi0 clk to comply with 115200 bps, use 100000 to - * calculate the clk dividor to make the clock a little slower - * than real baud rate. - */ - dw_writel(pspi, baudr, freq/100000); - - /* Disable all INT for early phase */ - dw_writel(pspi, imr, 0x0); - - /* Set the cs to spi-uart */ - dw_writel(pspi, ser, 0x2); - - /* Enable the HW, the last step for HW init */ - dw_writel(pspi, ssienr, 0x1); - - /* Set the default configuration */ - max3110_write_config(); - - /* Register the kmsg dumper */ - if (!dumper_registered) { - dw_dumper.dump = dw_kmsg_dump; - kmsg_dump_register(&dw_dumper); - dumper_registered = 1; - } -} - -/* Slave select should be called in the read/write function */ -static void early_mrst_spi_putc(char c) -{ - unsigned int timeout; - u32 sr; - - timeout = MRST_SPI_TIMEOUT; - /* Early putc needs to make sure the TX FIFO is not full */ - while (--timeout) { - sr = dw_readl(pspi, sr); - if (!(sr & SR_TF_NOT_FULL)) - cpu_relax(); - else - break; - } - - if (!timeout) - pr_warn("MRST earlycon: timed out\n"); - else - max3110_write_data(c); -} - -/* Early SPI only uses polling mode */ -static void early_mrst_spi_write(struct console *con, const char *str, - unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_mrst_spi_putc('\r'); - early_mrst_spi_putc(*str); - str++; - } -} - -struct console early_mrst_console = { - .name = "earlymrst", - .write = early_mrst_spi_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* * Following is the early console based on Medfield HSU (High * Speed UART) device. 
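[Editor's note] The HSU console that remains after this removal follows the usual polled early-console pattern: the UART registers are mapped through a fixmap slot and characters are pushed out with busy-waits on the line-status register, much as the deleted MAX3110 path did over SPI. The fragment below is an illustrative sketch only and is not part of the patch; the helper names mirror the style of this file's real HSU code but the bodies are simplified reconstructions, while writeb()/readb(), UART_TX/UART_LSR/UART_LSR_THRE from <linux/serial_reg.h> and struct console are standard kernel interfaces already included here.

/* Sketch: minimal polled write path over a fixmap-mapped HSU-style UART. */
static void __iomem *phsu;	/* stands in for the base mapped in hsu_early_console_init() */

static void early_hsu_putc(char c)
{
	unsigned int timeout = 10000;

	/* Busy-wait until the transmit holding register is empty. */
	while (--timeout && !(readb(phsu + UART_LSR) & UART_LSR_THRE))
		cpu_relax();
	writeb(c, phsu + UART_TX);
}

static void early_hsu_write(struct console *con, const char *str, unsigned n)
{
	unsigned int i;

	for (i = 0; i < n && *str; i++, str++) {
		if (*str == '\n')
			early_hsu_putc('\r');	/* expand LF to CRLF */
		early_hsu_putc(*str);
	}
}

struct console early_hsu_console = {
	.name	= "earlyhsu",
	.write	= early_hsu_write,
	.flags	= CON_PRINTBUFFER,
	.index	= -1,
};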
@@ -259,7 +47,7 @@ void hsu_early_console_init(const char *s) port = clamp_val(port, 0, 2); paddr = HSU_PORT_BASE + port * 0x80; - phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); + phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); /* Disable FIFO */ writeb(0x0, phsu + UART_FCR); diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 4762cff..32947ba 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now) spin_unlock_irqrestore(&rtc_lock, flags); } else { pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __FUNCTION__, now->tv_sec); + __func__, now->tv_sec); retval = -EINVAL; } return retval; diff --git a/arch/x86/platform/intel-quark/Makefile b/arch/x86/platform/intel-quark/Makefile new file mode 100644 index 0000000..9cc57ed --- /dev/null +++ b/arch/x86/platform/intel-quark/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_INTEL_IMR) += imr.o +obj-$(CONFIG_DEBUG_IMR_SELFTEST) += imr_selftest.o diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c new file mode 100644 index 0000000..0ee619f --- /dev/null +++ b/arch/x86/platform/intel-quark/imr.c @@ -0,0 +1,661 @@ +/** + * imr.c + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> + * + * IMR registers define an isolated region of memory that can + * be masked to prohibit certain system agents from accessing memory. + * When a device behind a masked port performs an access - snooped or + * not, an IMR may optionally prevent that transaction from changing + * the state of memory or from getting correct data in response to the + * operation. + * + * Write data will be dropped and reads will return 0xFFFFFFFF, the + * system will reset and system BIOS will print out an error message to + * inform the user that an IMR has been violated. + * + * This code is based on the Linux MTRR code and reference code from + * Intel's Quark BSP EFI, Linux and grub code. + * + * See quark-x1000-datasheet.pdf for register definitions. + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/quark-x1000-datasheet.pdf + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <asm-generic/sections.h> +#include <asm/cpu_device_id.h> +#include <asm/imr.h> +#include <asm/iosf_mbi.h> +#include <linux/debugfs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/types.h> + +struct imr_device { + struct dentry *file; + bool init; + struct mutex lock; + int max_imr; + int reg_base; +}; + +static struct imr_device imr_dev; + +/* + * IMR read/write mask control registers. + * See quark-x1000-datasheet.pdf sections 12.7.4.5 and 12.7.4.6 for + * bit definitions. + * + * addr_lo + * 31 Lock bit + * 30:24 Reserved + * 23:2 1 KiB aligned lo address + * 1:0 Reserved + * + * addr_hi + * 31:24 Reserved + * 23:2 1 KiB aligned hi address + * 1:0 Reserved + */ +#define IMR_LOCK BIT(31) + +struct imr_regs { + u32 addr_lo; + u32 addr_hi; + u32 rmask; + u32 wmask; +}; + +#define IMR_NUM_REGS (sizeof(struct imr_regs)/sizeof(u32)) +#define IMR_SHIFT 8 +#define imr_to_phys(x) ((x) << IMR_SHIFT) +#define phys_to_imr(x) ((x) >> IMR_SHIFT) + +/** + * imr_is_enabled - true if an IMR is enabled false otherwise. + * + * Determines if an IMR is enabled based on address range and read/write + * mask.
An IMR set with an address range set to zero and a read/write + * access mask set to all is considered to be disabled. An IMR in any + * other state - for example set to zero but without read/write access + * all is considered to be enabled. This definition of disabled is how + * firmware switches off an IMR and is maintained in kernel for + * consistency. + * + * @imr: pointer to IMR descriptor. + * @return: true if IMR enabled false if disabled. + */ +static inline int imr_is_enabled(struct imr_regs *imr) +{ + return !(imr->rmask == IMR_READ_ACCESS_ALL && + imr->wmask == IMR_WRITE_ACCESS_ALL && + imr_to_phys(imr->addr_lo) == 0 && + imr_to_phys(imr->addr_hi) == 0); +} + +/** + * imr_read - read an IMR at a given index. + * + * Requires caller to hold imr mutex. + * + * @idev: pointer to imr_device structure. + * @imr_id: IMR entry to read. + * @imr: IMR structure representing address and access masks. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) +{ + u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; + int ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->addr_lo); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->addr_hi); + if (ret) + return ret; + + ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->rmask); + if (ret) + return ret; + + return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ, + reg++, &imr->wmask); +} + +/** + * imr_write - write an IMR at a given index. + * + * Requires caller to hold imr mutex. + * Note lock bits need to be written independently of address bits. + * + * @idev: pointer to imr_device structure. + * @imr_id: IMR entry to write. + * @imr: IMR structure representing address and access masks. + * @lock: indicates if the IMR lock bit should be applied. + * @return: 0 on success or error code passed from mbi_iosf on failure. + */ +static int imr_write(struct imr_device *idev, u32 imr_id, + struct imr_regs *imr, bool lock) +{ + unsigned long flags; + u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; + int ret; + + local_irq_save(flags); + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++, + imr->addr_lo); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->addr_hi); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->rmask); + if (ret) + goto failed; + + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg++, imr->wmask); + if (ret) + goto failed; + + /* Lock bit must be set separately to addr_lo address bits. */ + if (lock) { + imr->addr_lo |= IMR_LOCK; + ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, + reg - IMR_NUM_REGS, imr->addr_lo); + if (ret) + goto failed; + } + + local_irq_restore(flags); + return 0; +failed: + /* + * If writing to the IOSF failed then we're in an unknown state, + * likely a very bad state. An IMR in an invalid state will almost + * certainly lead to a memory access violation. + */ + local_irq_restore(flags); + WARN(ret, "IOSF-MBI write fail range 0x%08x-0x%08x unreliable\n", + imr_to_phys(imr->addr_lo), imr_to_phys(imr->addr_hi) + IMR_MASK); + + return ret; +} + +/** + * imr_dbgfs_state_show - print state of IMR registers. + * + * @s: pointer to seq_file for output. + * @unused: unused parameter. + * @return: 0 on success or error code passed from mbi_iosf on failure. 
+ */ +static int imr_dbgfs_state_show(struct seq_file *s, void *unused) +{ + phys_addr_t base; + phys_addr_t end; + int i; + struct imr_device *idev = s->private; + struct imr_regs imr; + size_t size; + int ret = -ENODEV; + + mutex_lock(&idev->lock); + + for (i = 0; i < idev->max_imr; i++) { + + ret = imr_read(idev, i, &imr); + if (ret) + break; + + /* + * Remember to add IMR_ALIGN bytes to size to indicate the + * inherent IMR_ALIGN size bytes contained in the masked away + * lower ten bits. + */ + if (imr_is_enabled(&imr)) { + base = imr_to_phys(imr.addr_lo); + end = imr_to_phys(imr.addr_hi) + IMR_MASK; + } else { + base = 0; + end = 0; + } + size = end - base; + seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx " + "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i, + &base, &end, size, imr.rmask, imr.wmask, + imr_is_enabled(&imr) ? "enabled " : "disabled", + imr.addr_lo & IMR_LOCK ? "locked" : "unlocked"); + } + + mutex_unlock(&idev->lock); + return ret; +} + +/** + * imr_state_open - debugfs open callback. + * + * @inode: pointer to struct inode. + * @file: pointer to struct file. + * @return: result of single open. + */ +static int imr_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, imr_dbgfs_state_show, inode->i_private); +} + +static const struct file_operations imr_state_ops = { + .open = imr_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * imr_debugfs_register - register debugfs hooks. + * + * @idev: pointer to imr_device structure. + * @return: 0 on success - errno on failure. + */ +static int imr_debugfs_register(struct imr_device *idev) +{ + idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL, + idev, &imr_state_ops); + return PTR_ERR_OR_ZERO(idev->file); +} + +/** + * imr_debugfs_unregister - unregister debugfs hooks. + * + * @idev: pointer to imr_device structure. + * @return: + */ +static void imr_debugfs_unregister(struct imr_device *idev) +{ + debugfs_remove(idev->file); +} + +/** + * imr_check_params - check passed address range IMR alignment and non-zero size + * + * @base: base address of intended IMR. + * @size: size of intended IMR. + * @return: zero on valid range -EINVAL on unaligned base/size. + */ +static int imr_check_params(phys_addr_t base, size_t size) +{ + if ((base & IMR_MASK) || (size & IMR_MASK)) { + pr_err("base %pa size 0x%08zx must align to 1KiB\n", + &base, size); + return -EINVAL; + } + if (size == 0) + return -EINVAL; + + return 0; +} + +/** + * imr_raw_size - account for the IMR_ALIGN bytes that addr_hi appends. + * + * IMR addr_hi has a built in offset of plus IMR_ALIGN (0x400) bytes from the + * value in the register. We need to subtract IMR_ALIGN bytes from input sizes + * as a result. + * + * @size: input size bytes. + * @return: reduced size. + */ +static inline size_t imr_raw_size(size_t size) +{ + return size - IMR_ALIGN; +} + +/** + * imr_address_overlap - detects an address overlap. + * + * @addr: address to check against an existing IMR. + * @imr: imr being checked. + * @return: true for overlap false for no overlap. + */ +static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) +{ + return addr >= imr_to_phys(imr->addr_lo) && addr <= imr_to_phys(imr->addr_hi); +} + +/** + * imr_add_range - add an Isolated Memory Region. + * + * @base: physical base address of region aligned to 1KiB. + * @size: physical size of region in bytes must be aligned to 1KiB. + * @read_mask: read access mask. + * @write_mask: write access mask. 
+ * @lock: indicates whether or not to permanently lock this region. + * @return: zero on success or negative value indicating error. + */ +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock) +{ + phys_addr_t end; + unsigned int i; + struct imr_device *idev = &imr_dev; + struct imr_regs imr; + size_t raw_size; + int reg; + int ret; + + if (WARN_ONCE(idev->init == false, "driver not initialized")) + return -ENODEV; + + ret = imr_check_params(base, size); + if (ret) + return ret; + + /* Tweak the size value. */ + raw_size = imr_raw_size(size); + end = base + raw_size; + + /* + * Check for reserved IMR value common to firmware, kernel and grub + * indicating a disabled IMR. + */ + imr.addr_lo = phys_to_imr(base); + imr.addr_hi = phys_to_imr(end); + imr.rmask = rmask; + imr.wmask = wmask; + if (!imr_is_enabled(&imr)) + return -ENOTSUPP; + + mutex_lock(&idev->lock); + + /* + * Find a free IMR while checking for an existing overlapping range. + * Note there's no restriction in silicon to prevent IMR overlaps. + * For the sake of simplicity and ease in defining/debugging an IMR + * memory map we exclude IMR overlaps. + */ + reg = -1; + for (i = 0; i < idev->max_imr; i++) { + ret = imr_read(idev, i, &imr); + if (ret) + goto failed; + + /* Find overlap @ base or end of requested range. */ + ret = -EINVAL; + if (imr_is_enabled(&imr)) { + if (imr_address_overlap(base, &imr)) + goto failed; + if (imr_address_overlap(end, &imr)) + goto failed; + } else { + reg = i; + } + } + + /* Error out if we have no free IMR entries. */ + if (reg == -1) { + ret = -ENOMEM; + goto failed; + } + + pr_debug("add %d phys %pa-%pa size %zx mask 0x%08x wmask 0x%08x\n", + reg, &base, &end, raw_size, rmask, wmask); + + /* Enable IMR at specified range and access mask. */ + imr.addr_lo = phys_to_imr(base); + imr.addr_hi = phys_to_imr(end); + imr.rmask = rmask; + imr.wmask = wmask; + + ret = imr_write(idev, reg, &imr, lock); + if (ret < 0) { + /* + * In the highly unlikely event iosf_mbi_write failed + * attempt to rollback the IMR setup skipping the trapping + * of further IOSF write failures. + */ + imr.addr_lo = 0; + imr.addr_hi = 0; + imr.rmask = IMR_READ_ACCESS_ALL; + imr.wmask = IMR_WRITE_ACCESS_ALL; + imr_write(idev, reg, &imr, false); + } +failed: + mutex_unlock(&idev->lock); + return ret; +} +EXPORT_SYMBOL_GPL(imr_add_range); + +/** + * __imr_remove_range - delete an Isolated Memory Region. + * + * This function allows you to delete an IMR by its index specified by reg or + * by address range specified by base and size respectively. If you specify an + * index on its own the base and size parameters are ignored. + * imr_remove_range(0, base, size); delete IMR at index 0 base/size ignored. + * imr_remove_range(-1, base, size); delete IMR from base to base+size. + * + * @reg: imr index to remove. + * @base: physical base address of region aligned to 1 KiB. + * @size: physical size of region in bytes aligned to 1 KiB. + * @return: -EINVAL on invalid range or out or range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. 
+ */ +static int __imr_remove_range(int reg, phys_addr_t base, size_t size) +{ + phys_addr_t end; + bool found = false; + unsigned int i; + struct imr_device *idev = &imr_dev; + struct imr_regs imr; + size_t raw_size; + int ret = 0; + + if (WARN_ONCE(idev->init == false, "driver not initialized")) + return -ENODEV; + + /* + * Validate address range if deleting by address, else we are + * deleting by index where base and size will be ignored. + */ + if (reg == -1) { + ret = imr_check_params(base, size); + if (ret) + return ret; + } + + /* Tweak the size value. */ + raw_size = imr_raw_size(size); + end = base + raw_size; + + mutex_lock(&idev->lock); + + if (reg >= 0) { + /* If a specific IMR is given try to use it. */ + ret = imr_read(idev, reg, &imr); + if (ret) + goto failed; + + if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) { + ret = -ENODEV; + goto failed; + } + found = true; + } else { + /* Search for match based on address range. */ + for (i = 0; i < idev->max_imr; i++) { + ret = imr_read(idev, i, &imr); + if (ret) + goto failed; + + if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) + continue; + + if ((imr_to_phys(imr.addr_lo) == base) && + (imr_to_phys(imr.addr_hi) == end)) { + found = true; + reg = i; + break; + } + } + } + + if (!found) { + ret = -ENODEV; + goto failed; + } + + pr_debug("remove %d phys %pa-%pa size %zx\n", reg, &base, &end, raw_size); + + /* Tear down the IMR. */ + imr.addr_lo = 0; + imr.addr_hi = 0; + imr.rmask = IMR_READ_ACCESS_ALL; + imr.wmask = IMR_WRITE_ACCESS_ALL; + + ret = imr_write(idev, reg, &imr, false); + +failed: + mutex_unlock(&idev->lock); + return ret; +} + +/** + * imr_remove_range - delete an Isolated Memory Region by address + * + * This function allows you to delete an IMR by an address range specified + * by base and size respectively. + * imr_remove_range(base, size); delete IMR from base to base+size. + * + * @base: physical base address of region aligned to 1 KiB. + * @size: physical size of region in bytes aligned to 1 KiB. + * @return: -EINVAL on invalid range or out of range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +int imr_remove_range(phys_addr_t base, size_t size) +{ + return __imr_remove_range(-1, base, size); +} +EXPORT_SYMBOL_GPL(imr_remove_range); + +/** + * imr_clear - delete an Isolated Memory Region by index + * + * This function allows you to delete an IMR by its index. Useful for + * initial sanitization of the IMR address map. + * imr_clear(reg); delete the IMR at index reg. + * + * @reg: imr index to remove. + * @return: -EINVAL on invalid range or out of range id + * -ENODEV if reg is valid but no IMR exists or is locked + * 0 on success. + */ +static inline int imr_clear(int reg) +{ + return __imr_remove_range(reg, 0, 0); +} + +/** + * imr_fixup_memmap - Tear down IMRs used during bootup. + * + * BIOS and Grub both setup IMRs around compressed kernel, initrd memory + * that need to be removed before the kernel hands out one of the IMR + * encased addresses to a downstream DMA agent such as the SD or Ethernet. + * IMRs on Galileo are setup to immediately reset the system on violation. + * As a result if you're running a root filesystem from SD - you'll need + * the boot-time IMRs torn down or you'll find seemingly random resets when + * using your filesystem. + * + * @idev: pointer to imr_device structure.
+ * @return: + */ +static void __init imr_fixup_memmap(struct imr_device *idev) +{ + phys_addr_t base = virt_to_phys(&_text); + size_t size = virt_to_phys(&__end_rodata) - base; + int i; + int ret; + + /* Tear down all existing unlocked IMRs. */ + for (i = 0; i < idev->max_imr; i++) + imr_clear(i); + + /* + * Setup a locked IMR around the physical extent of the kernel + * from the beginning of the .text section to the end of the + * .rodata section as one physically contiguous block. + */ + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true); + if (ret < 0) { + pr_err("unable to setup IMR for kernel: (%p - %p)\n", + &_text, &__end_rodata); + } else { + pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n", + size / 1024, &_text, &__end_rodata); + } + +} + +static const struct x86_cpu_id imr_ids[] __initconst = { + { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ + {} +}; +MODULE_DEVICE_TABLE(x86cpu, imr_ids); + +/** + * imr_init - entry point for IMR driver. + * + * return: -ENODEV for no IMR support, 0 if good to go. + */ +static int __init imr_init(void) +{ + struct imr_device *idev = &imr_dev; + int ret; + + if (!x86_match_cpu(imr_ids) || !iosf_mbi_available()) + return -ENODEV; + + idev->max_imr = QUARK_X1000_IMR_MAX; + idev->reg_base = QUARK_X1000_IMR_REGBASE; + idev->init = true; + + mutex_init(&idev->lock); + ret = imr_debugfs_register(idev); + if (ret != 0) + pr_warn("debugfs register failed!\n"); + imr_fixup_memmap(idev); + return 0; +} + +/** + * imr_exit - exit point for IMR code. + * + * Deregisters debugfs, leaves IMR state as-is. + * + * return: + */ +static void __exit imr_exit(void) +{ + imr_debugfs_unregister(&imr_dev); +} + +module_init(imr_init); +module_exit(imr_exit); + +MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>"); +MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c new file mode 100644 index 0000000..c9a0838 --- /dev/null +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -0,0 +1,129 @@ +/** + * imr_selftest.c + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> + * + * IMR self test. The purpose of this module is to run a set of tests on the + * IMR API to validate its sanity. We check for overlapping, reserved + * addresses and setup/teardown sanity. + * + */ + +#include <asm-generic/sections.h> +#include <asm/imr.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/types.h> + +#define SELFTEST KBUILD_MODNAME ": " +/** + * imr_self_test_result - Print result string for self test. + * + * @res: result code - true if test passed false otherwise. + * @fmt: format string. + * ... variadic argument list. + */ +static void __init imr_self_test_result(int res, const char *fmt, ...) +{ + va_list vlist; + + /* Print pass/fail. */ + if (res) + pr_info(SELFTEST "pass "); + else + pr_info(SELFTEST "fail "); + + /* Print variable string. */ + va_start(vlist, fmt); + vprintk(fmt, vlist); + va_end(vlist); + + /* Optional warning. */ + WARN(res == 0, "test failed"); +} +#undef SELFTEST + +/** + * imr_self_test + * + * Verify IMR self_test with some simple tests to verify overlap, + * zero sized allocations and 1 KiB sized areas.
+ * + */ +static void __init imr_self_test(void) +{ + phys_addr_t base = virt_to_phys(&_text); + size_t size = virt_to_phys(&__end_rodata) - base; + const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n"; + int ret; + + /* Test zero zero. */ + ret = imr_add_range(0, 0, 0, 0, false); + imr_self_test_result(ret < 0, "zero sized IMR\n"); + + /* Test exact overlap. */ + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test overlap with base inside of existing. */ + base += size - IMR_ALIGN; + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test overlap with end inside of existing. */ + base -= size + IMR_ALIGN * 2; + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); + + /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ + ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, + IMR_WRITE_ACCESS_ALL, false); + imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); + + /* Test that a 1 KiB IMR @ zero with CPU only will work. */ + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); + if (ret >= 0) { + ret = imr_remove_range(0, IMR_ALIGN); + imr_self_test_result(ret == 0, "teardown - cpu-access\n"); + } + + /* Test 2 KiB works. */ + size = IMR_ALIGN * 2; + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, + IMR_WRITE_ACCESS_ALL, false); + imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); + if (ret >= 0) { + ret = imr_remove_range(0, size); + imr_self_test_result(ret == 0, "teardown 2KiB\n"); + } +} + +/** + * imr_self_test_init - entry point for IMR driver. + * + * return: -ENODEV for no IMR support 0 if good to go. + */ +static int __init imr_self_test_init(void) +{ + imr_self_test(); + return 0; +} + +/** + * imr_self_test_exit - exit point for IMR code. 
+ * + * return: + */ +static void __exit imr_self_test_exit(void) +{ +} + +module_init(imr_self_test_init); +module_exit(imr_self_test_exit); + +MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>"); +MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c6b146e..7488caf 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -273,20 +273,6 @@ static inline void uv_clear_nmi(int cpu) } } -/* Print non-responding cpus */ -static void uv_nmi_nr_cpus_pr(char *fmt) -{ - static char cpu_list[1024]; - int len = sizeof(cpu_list); - int c = cpumask_weight(uv_nmi_cpu_mask); - int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask); - - if (n >= len-1) - strcpy(&cpu_list[len - 6], "...\n"); - - printk(fmt, c, cpu_list); -} - /* Ping non-responding cpus attemping to force them into the NMI handler */ static void uv_nmi_nr_cpus_ping(void) { @@ -371,16 +357,19 @@ static void uv_nmi_wait(int master) break; /* if not all made it in, send IPI NMI to them */ - uv_nmi_nr_cpus_pr(KERN_ALERT - "UV: Sending NMI IPI to %d non-responding CPUs: %s\n"); + pr_alert("UV: Sending NMI IPI to %d non-responding CPUs: %*pbl\n", + cpumask_weight(uv_nmi_cpu_mask), + cpumask_pr_args(uv_nmi_cpu_mask)); + uv_nmi_nr_cpus_ping(); /* if all cpus are in, then done */ if (!uv_nmi_wait_cpus(0)) break; - uv_nmi_nr_cpus_pr(KERN_ALERT - "UV: %d CPUs not in NMI loop: %s\n"); + pr_alert("UV: %d CPUs not in NMI loop: %*pbl\n", + cpumask_weight(uv_nmi_cpu_mask), + cpumask_pr_args(uv_nmi_cpu_mask)); } while (0); pr_alert("UV: %d of %d CPUs in NMI\n", diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6ec7910..3e32ed5 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -105,11 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); -#ifdef CONFIG_X86_32 - ctxt->cr4 = read_cr4_safe(); -#else -/* CONFIG_X86_64 */ - ctxt->cr4 = read_cr4(); + ctxt->cr4 = __read_cr4_safe(); +#ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); #endif ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, @@ -175,12 +172,12 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) /* cr4 was introduced in the Pentium CPU */ #ifdef CONFIG_X86_32 if (ctxt->cr4) - write_cr4(ctxt->cr4); + __write_cr4(ctxt->cr4); #else /* CONFIG X86_64 */ wrmsrl(MSR_EFER, ctxt->efer); write_cr8(ctxt->cr8); - write_cr4(ctxt->cr4); + __write_cr4(ctxt->cr4); #endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index 94f7fbe..e02c2c6 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -6,7 +6,7 @@ # for more details. 
# # - +KASAN_SANITIZE := n subdir- := rm obj-y += init.o diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index bad628a..0b7a63d 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -81,7 +81,7 @@ void __init setup_real_mode(void) trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = read_cr4(); + *trampoline_cr4_features = __read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 7c0d7be..2730d77 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -6,6 +6,7 @@ # for more details. # # +KASAN_SANITIZE := n always := realmode.bin realmode.relocs diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 79d8245..0c8c32b 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs, int err, pid; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = copy_from_user(&sc, from, sizeof(sc)); if (err) diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 09297c8..7b9be98 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -3,6 +3,7 @@ # KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 78a881b..bd8b845 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1494,10 +1494,10 @@ static void xen_pvh_set_cr_flags(int cpu) * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. */ if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + cr4_set_bits_and_update_boot(X86_CR4_PSE); if (cpu_has_pge) - set_in_cr4(X86_CR4_PGE); + cr4_set_bits_and_update_boot(X86_CR4_PGE); } /* diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5c1f9ac..adca9e2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1489,7 +1489,7 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { struct mmuext_op op; op.cmd = cmd; @@ -1657,7 +1657,7 @@ void __init xen_reserve_top(void) * Like __va(), but returns address in the kernel mapping (which is * all we have until the physical memory mapping has been set up. 
*/ -static void *__ka(phys_addr_t paddr) +static void * __init __ka(phys_addr_t paddr) { #ifdef CONFIG_X86_64 return (void *)(paddr + __START_KERNEL_map); @@ -1667,7 +1667,7 @@ static void *__ka(phys_addr_t paddr) } /* Convert a machine address to physical address */ -static unsigned long m2p(phys_addr_t maddr) +static unsigned long __init m2p(phys_addr_t maddr) { phys_addr_t paddr; @@ -1678,13 +1678,14 @@ static unsigned long m2p(phys_addr_t maddr) } /* Convert a machine address to kernel virtual */ -static void *m2v(phys_addr_t maddr) +static void * __init m2v(phys_addr_t maddr) { return __ka(m2p(maddr)); } /* Set the page permissions on an identity-mapped pages */ -static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) +static void __init set_page_prot_flags(void *addr, pgprot_t prot, + unsigned long flags) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); @@ -1696,7 +1697,7 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } -static void set_page_prot(void *addr, pgprot_t prot) +static void __init set_page_prot(void *addr, pgprot_t prot) { return set_page_prot_flags(addr, prot, UVMF_NONE); } @@ -1733,10 +1734,8 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; -#ifdef CONFIG_X86_32 if (pfn > max_pfn_mapped) max_pfn_mapped = pfn; -#endif if (!pte_none(pte_page[pteidx])) continue; @@ -1769,7 +1768,7 @@ void __init xen_setup_machphys_mapping(void) } #ifdef CONFIG_X86_64 -static void convert_pfn_mfn(void *v) +static void __init convert_pfn_mfn(void *v) { pte_t *pte = v; int i; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 70fb507..740ae30 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -84,8 +84,6 @@ #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) -static void __init m2p_override_init(void); - unsigned long *xen_p2m_addr __read_mostly; EXPORT_SYMBOL_GPL(xen_p2m_addr); unsigned long xen_p2m_size __read_mostly; @@ -402,8 +400,6 @@ void __init xen_vmalloc_p2m_tree(void) xen_p2m_size = xen_max_p2m_pfn; xen_inv_extra_mem(); - - m2p_override_init(); } unsigned long get_phys_to_machine(unsigned long pfn) @@ -554,7 +550,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = NULL; } - p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep)); + p2m_pfn = pte_pfn(READ_ONCE(*ptep)); if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) || p2m_pfn == PFN_DOWN(__pa(p2m_missing))) { /* p2m leaf page is missing */ @@ -652,100 +648,21 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } -#define M2P_OVERRIDE_HASH_SHIFT 10 -#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) - -static struct list_head *m2p_overrides; -static DEFINE_SPINLOCK(m2p_override_lock); - -static void __init m2p_override_init(void) -{ - unsigned i; - - m2p_overrides = alloc_bootmem_align( - sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, - sizeof(unsigned long)); - - for (i = 0; i < M2P_OVERRIDE_HASH; i++) - INIT_LIST_HEAD(&m2p_overrides[i]); -} - -static unsigned long mfn_hash(unsigned long mfn) -{ - return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); -} - -/* Add an MFN override for a particular page */ -static int m2p_add_override(unsigned long mfn, struct page *page, - struct gnttab_map_grant_ref *kmap_op) -{ - unsigned long flags; - unsigned long pfn; - unsigned long uninitialized_var(address); - unsigned level; - pte_t *ptep = NULL; - - 
pfn = page_to_pfn(page); - if (!PageHighMem(page)) { - address = (unsigned long)__va(pfn << PAGE_SHIFT); - ptep = lookup_address(address, &level); - if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_add_override: pfn %lx not mapped", pfn)) - return -EINVAL; - } - - if (kmap_op != NULL) { - if (!PageHighMem(page)) { - struct multicall_space mcs = - xen_mc_entry(sizeof(*kmap_op)); - - MULTI_grant_table_op(mcs.mc, - GNTTABOP_map_grant_ref, kmap_op, 1); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - } - } - spin_lock_irqsave(&m2p_override_lock, flags); - list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); - spin_unlock_irqrestore(&m2p_override_lock, flags); - - /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in - * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other - * pfn so that the following mfn_to_pfn(mfn) calls will return the - * pfn from the m2p_override (the backend pfn) instead. - * We need to do this because the pages shared by the frontend - * (xen-blkfront) can be already locked (lock_page, called by - * do_read_cache_page); when the userspace backend tries to use them - * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so - * do_blockdev_direct_IO is going to try to lock the same pages - * again resulting in a deadlock. - * As a side effect get_user_pages_fast might not be safe on the - * frontend pages while they are being shared with the backend, - * because mfn_to_pfn (that ends up being called by GUPF) will - * return the backend pfn rather than the frontend pfn. */ - pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) == mfn) - set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); - - return 0; -} - int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { int i, ret = 0; - bool lazy = false; pte_t *pte; if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - if (kmap_ops && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; + if (kmap_ops) { + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + kmap_ops, count); + if (ret) + goto out; } for (i = 0; i < count; i++) { @@ -764,170 +681,28 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, } pfn = page_to_pfn(pages[i]); - WARN_ON(PagePrivate(pages[i])); - SetPagePrivate(pages[i]); - set_page_private(pages[i], mfn); - pages[i]->index = pfn_to_mfn(pfn); + WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned"); if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { ret = -ENOMEM; goto out; } - - if (kmap_ops) { - ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); - if (ret) - goto out; - } } out: - if (lazy) - arch_leave_lazy_mmu_mode(); - return ret; } EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); -static struct page *m2p_find_override(unsigned long mfn) -{ - unsigned long flags; - struct list_head *bucket; - struct page *p, *ret; - - if (unlikely(!m2p_overrides)) - return NULL; - - ret = NULL; - bucket = &m2p_overrides[mfn_hash(mfn)]; - - spin_lock_irqsave(&m2p_override_lock, flags); - - list_for_each_entry(p, bucket, lru) { - if (page_private(p) == mfn) { - ret = p; - break; - } - } - - spin_unlock_irqrestore(&m2p_override_lock, flags); - - return ret; -} - -static int m2p_remove_override(struct page *page, - struct gnttab_map_grant_ref *kmap_op, - unsigned long mfn) -{ - unsigned long flags; - unsigned long pfn; - unsigned long uninitialized_var(address); - unsigned level; - pte_t *ptep 
= NULL; - - pfn = page_to_pfn(page); - - if (!PageHighMem(page)) { - address = (unsigned long)__va(pfn << PAGE_SHIFT); - ptep = lookup_address(address, &level); - - if (WARN(ptep == NULL || level != PG_LEVEL_4K, - "m2p_remove_override: pfn %lx not mapped", pfn)) - return -EINVAL; - } - - spin_lock_irqsave(&m2p_override_lock, flags); - list_del(&page->lru); - spin_unlock_irqrestore(&m2p_override_lock, flags); - - if (kmap_op != NULL) { - if (!PageHighMem(page)) { - struct multicall_space mcs; - struct gnttab_unmap_and_replace *unmap_op; - struct page *scratch_page = get_balloon_scratch_page(); - unsigned long scratch_page_address = (unsigned long) - __va(page_to_pfn(scratch_page) << PAGE_SHIFT); - - /* - * It might be that we queued all the m2p grant table - * hypercalls in a multicall, then m2p_remove_override - * get called before the multicall has actually been - * issued. In this case handle is going to -1 because - * it hasn't been modified yet. - */ - if (kmap_op->handle == -1) - xen_mc_flush(); - /* - * Now if kmap_op->handle is negative it means that the - * hypercall actually returned an error. - */ - if (kmap_op->handle == GNTST_general_error) { - pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings", - pfn, mfn); - put_balloon_scratch_page(); - return -1; - } - - xen_mc_batch(); - - mcs = __xen_mc_entry( - sizeof(struct gnttab_unmap_and_replace)); - unmap_op = mcs.args; - unmap_op->host_addr = kmap_op->host_addr; - unmap_op->new_addr = scratch_page_address; - unmap_op->handle = kmap_op->handle; - - MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_and_replace, unmap_op, 1); - - mcs = __xen_mc_entry(0); - MULTI_update_va_mapping(mcs.mc, scratch_page_address, - pfn_pte(page_to_pfn(scratch_page), - PAGE_KERNEL_RO), 0); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - kmap_op->host_addr = 0; - put_balloon_scratch_page(); - } - } - - /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present - * somewhere in this domain, even before being added to the - * m2p_override (see comment above in m2p_add_override). - * If there are no other entries in the m2p_override corresponding - * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for - * the original pfn (the one shared by the frontend): the backend - * cannot do any IO on this page anymore because it has been - * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of - * the original pfn causes mfn_to_pfn(mfn) to return the frontend - * pfn again. 
*/ - mfn &= ~FOREIGN_FRAME_BIT; - pfn = mfn_to_pfn_no_overrides(mfn); - if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) && - m2p_find_override(mfn) == NULL) - set_phys_to_machine(pfn, mfn); - - return 0; -} - int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_map_grant_ref *kmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, struct page **pages, unsigned int count) { int i, ret = 0; - bool lazy = false; if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - if (kmap_ops && - !in_interrupt() && - paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - for (i = 0; i < count; i++) { unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); @@ -937,36 +712,16 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, goto out; } - set_page_private(pages[i], INVALID_P2M_ENTRY); - WARN_ON(!PagePrivate(pages[i])); - ClearPagePrivate(pages[i]); - set_phys_to_machine(pfn, pages[i]->index); - - if (kmap_ops) - ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); - if (ret) - goto out; + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } - + if (kunmap_ops) + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + kunmap_ops, count); out: - if (lazy) - arch_leave_lazy_mmu_mode(); return ret; } EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); -unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn) -{ - struct page *p = m2p_find_override(mfn); - unsigned long ret = pfn; - - if (p) - ret = page_to_pfn(p); - - return ret; -} -EXPORT_SYMBOL_GPL(m2p_find_override_pfn); - #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> #include "debugfs.h" diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 865e56c..55f388e 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -32,16 +32,6 @@ #include "p2m.h" #include "mmu.h" -/* These are code, but not functions. 
Defined in entry.S */ -extern const char xen_hypervisor_callback[]; -extern const char xen_failsafe_callback[]; -#ifdef CONFIG_X86_64 -extern asmlinkage void nmi(void); -#endif -extern void xen_sysenter_target(void); -extern void xen_syscall_target(void); -extern void xen_syscall32_target(void); - /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; @@ -74,7 +64,7 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(u64 start, u64 size) +static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) { int i; @@ -97,10 +87,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); } -static void __init xen_del_extra_mem(u64 start, u64 size) +static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) { int i; - u64 start_r, size_r; + phys_addr_t start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { start_r = xen_extra_mem[i].start; @@ -267,7 +257,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) { struct mmu_update update = { - .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + .ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, .val = pfn }; @@ -545,20 +535,21 @@ static unsigned long __init xen_get_max_pages(void) return min(max_pages, MAX_DOMAIN_PAGES); } -static void xen_align_and_add_e820_region(u64 start, u64 size, int type) +static void __init xen_align_and_add_e820_region(phys_addr_t start, + phys_addr_t size, int type) { - u64 end = start + size; + phys_addr_t end = start + size; /* Align RAM regions to page boundaries. 
*/ if (type == E820_RAM) { start = PAGE_ALIGN(start); - end &= ~((u64)PAGE_SIZE - 1); + end &= ~((phys_addr_t)PAGE_SIZE - 1); } e820_add_region(start, end - start, type); } -void xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) { struct e820entry *entry; unsigned int i; @@ -577,7 +568,7 @@ char * __init xen_memory_setup(void) static struct e820entry map[E820MAX] __initdata; unsigned long max_pfn = xen_start_info->nr_pages; - unsigned long long mem_end; + phys_addr_t mem_end; int rc; struct xen_memory_map memmap; unsigned long max_pages; @@ -652,16 +643,16 @@ char * __init xen_memory_setup(void) extra_pages); i = 0; while (i < memmap.nr_entries) { - u64 addr = map[i].addr; - u64 size = map[i].size; + phys_addr_t addr = map[i].addr; + phys_addr_t size = map[i].size; u32 type = map[i].type; if (type == E820_RAM) { if (addr < mem_end) { size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, (u64)extra_pages * PAGE_SIZE); - extra_pages -= size / PAGE_SIZE; + size = min(size, PFN_PHYS(extra_pages)); + extra_pages -= PFN_DOWN(size); xen_add_extra_mem(addr, size); xen_max_p2m_pfn = PFN_DOWN(addr + size); } else diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 4c071ae..08e8489 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -507,7 +507,7 @@ static int xen_cpu_disable(void) static void xen_cpu_die(unsigned int cpu) { while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { - current->state = TASK_UNINTERRUPTIBLE; + __set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ/10); } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23b45eb..956374c 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -41,7 +41,7 @@ static u8 zero_stats; static inline void check_zero(void) { u8 ret; - u8 old = ACCESS_ONCE(zero_stats); + u8 old = READ_ONCE(zero_stats); if (unlikely(old)) { ret = cmpxchg(&zero_stats, old, 0); /* This ensures only one fellow resets the stat */ @@ -112,6 +112,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting); int cpu = smp_processor_id(); u64 start; + __ticket_t head; unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ @@ -159,11 +160,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) */ __ticket_enter_slowpath(lock); + /* make sure enter_slowpath, which is atomic does not cross the read */ + smp_mb__after_atomic(); + /* * check again make sure it didn't become free while * we weren't looking */ - if (ACCESS_ONCE(lock->tickets.head) == want) { + head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(head, want)) { add_stats(TAKEN_SLOW_PICKUP, 1); goto out; } @@ -204,8 +209,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); /* Make sure we read lock before want */ - if (ACCESS_ONCE(w->lock) == lock && - ACCESS_ONCE(w->want) == next) { + if (READ_ONCE(w->lock) == lock && + READ_ONCE(w->want) == next) { add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 6908734..55da33b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -479,6 +479,10 @@ static void __init xen_time_init(void) int cpu = smp_processor_id(); struct timespec tp; + /* As Dom0 is never 
moved, no penalty on using TSC there */ + if (xen_initial_domain()) + xen_clocksource.rating = 275; + clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 5686bd9..9e195c6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -10,6 +10,12 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +void xen_sysenter_target(void); +#ifdef CONFIG_X86_64 +void xen_syscall_target(void); +void xen_syscall32_target(void); +#endif + extern void *xen_initial_gdt; struct trap_info;
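[Editor's note] As a closing aside on the intel-quark IMR code added earlier in this diff: the exported API is small, and a usage sketch may help readers of imr_add_range()/imr_remove_range(). The prototypes, the 1 KiB IMR_ALIGN granularity and the IMR_CPU/IMR_READ_ACCESS_ALL/IMR_WRITE_ACCESS_ALL masks are taken from the patch itself; the base address, size and example_imr_use() below are hypothetical and are not part of the patch.

/* Sketch: carve out a CPU-only IMR over a 1 KiB-aligned region, then remove it. */
#include <asm/imr.h>
#include <linux/init.h>
#include <linux/kernel.h>

static int __init example_imr_use(void)
{
	phys_addr_t base = 0x10000000;		/* hypothetical, 1 KiB aligned */
	size_t size = 4 * IMR_ALIGN;		/* 4 KiB, 1 KiB aligned */
	int ret;

	/* Only the CPU may read or write; lock=false so it can be torn down later. */
	ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
	if (ret)
		return ret;	/* -EINVAL, -ENOMEM (no free IMR), -ENODEV, ... */

	/* ... the region is now shielded from non-CPU agents ... */

	return imr_remove_range(base, size);
}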