From fa19871a166f6a940eb97dd511d3ddb1841ed95a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 14 Jul 2017 13:38:28 +0200 Subject: KVM: VMX: remove unused field Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29fd8af..d04092f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -563,7 +563,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; -- cgit v1.1 From b3625980a65db6b6b6bbd5790a77ab95ce6397c5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 13 Jul 2017 10:35:45 -0700 Subject: perf/x86/intel/uncore: Fix Skylake UPI PMU event masks This patch fixes the event_mask and event_ext_mask for the Intel Skylake Server UPI PMU. Bit 21 is not used as a filter. The extended umask is from bit 32 to bit 55. Correct both umasks. Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1499967350-10385-2-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index dae2fed..19a00a7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -316,7 +316,7 @@ #define SKX_UPI_PCI_PMON_CTL0 0x350 #define SKX_UPI_PCI_PMON_CTR0 0x318 #define SKX_UPI_PCI_PMON_BOX_CTL 0x378 -#define SKX_PMON_CTL_UMASK_EXT 0xff +#define SKX_UPI_CTL_UMASK_EXT 0xffefff /* SKX M2M */ #define SKX_M2M_PCI_PMON_CTL0 0x228 @@ -328,7 +328,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); @@ -3603,8 +3603,8 @@ static struct intel_uncore_type skx_uncore_upi = { .perf_ctr_bits = 48, .perf_ctr = SKX_UPI_PCI_PMON_CTR0, .event_ctl = SKX_UPI_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .event_mask_ext = SKX_PMON_CTL_UMASK_EXT, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_UPI_CTL_UMASK_EXT, .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &skx_upi_uncore_format_group, -- cgit v1.1 From bab4e569e80c07ba6fe5e4f2d815adeef26cee94 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 13 Jul 2017 10:35:46 -0700 Subject: perf/x86/intel/uncore: Fix Skylake server PCU PMU event format PCU event format for SKX are different from snbep. Introduce a new format group for SKX PCU. Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1499967350-10385-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 19a00a7..fbf8f6e 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3492,6 +3492,26 @@ static struct intel_uncore_type skx_uncore_irp = { .format_group = &skx_uncore_format_group, }; +static struct attribute *skx_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute_group skx_uncore_pcu_format_group = { + .name = "format", + .attrs = skx_uncore_pcu_formats_attr, +}; + static struct intel_uncore_ops skx_uncore_pcu_ops = { IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), .hw_config = hswep_pcu_hw_config, @@ -3510,7 +3530,7 @@ static struct intel_uncore_type skx_uncore_pcu = { .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, .num_shared_regs = 1, .ops = &skx_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, + .format_group = &skx_uncore_pcu_format_group, }; static struct intel_uncore_type *skx_msr_uncores[] = { -- cgit v1.1 From c3f02682a101b83424128915b14e60c156c03f02 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 13 Jul 2017 10:35:47 -0700 Subject: perf/x86/intel/uncore: Fix Skylake server CHA LLC_LOOKUP event umask Correct the umask for LLC_LOOKUP.LOCAL and LLC_LOOKUP.REMOTE events Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1499967350-10385-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index fbf8f6e..a30bf97 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3333,8 +3333,8 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), }; static u64 skx_cha_filter_mask(int fields) -- cgit v1.1 From 9ad0fbd8fcd9e6815908c772f8d792a9d764449e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 13 Jul 2017 10:35:48 -0700 Subject: perf/x86/intel/uncore: Remove invalid Skylake server CHA filter field There is no field c6 and link for CHA BOX FILTER. Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1499967350-10385-5-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index a30bf97..2401d06 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -351,7 +351,6 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); @@ -3302,7 +3301,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { &format_attr_inv.attr, &format_attr_thresh8.attr, &format_attr_filter_tid4.attr, - &format_attr_filter_link4.attr, &format_attr_filter_state5.attr, &format_attr_filter_rem.attr, &format_attr_filter_loc.attr, @@ -3312,7 +3310,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { &format_attr_filter_opc_0.attr, &format_attr_filter_opc_1.attr, &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, &format_attr_filter_isoc.attr, NULL, }; -- cgit v1.1 From 8aa7b7b4b4a601978672dce6604b9f5630b2eeb8 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 13 Jul 2017 10:35:49 -0700 Subject: perf/x86/intel/uncore: Fix SKX CHA event extra regs This patch adds two missing event extra regs for Skylake Server CHA PMU: - TOR_INSERTS - TOR_OCCUPANCY Were missing support for all the filters, including opcode matchers. Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1499967350-10385-6-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 2401d06..f9f825b 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3332,6 +3332,8 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8), }; static u64 skx_cha_filter_mask(int fields) @@ -3344,6 +3346,17 @@ static u64 skx_cha_filter_mask(int fields) mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK; if (fields & 0x4) mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) { + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC; + } return mask; } -- cgit v1.1 From ba883b4abc9cd837441b01eb9cf8d9196181294d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 13 Jul 2017 10:35:50 -0700 Subject: perf/x86/intel/uncore: Fix missing marker for skx_uncore_cha_extra_regs This skx_uncore_cha_extra_regs array was missing an end-marker. Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1499967350-10385-7-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index f9f825b..4f91276 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3334,6 +3334,7 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8), + EVENT_EXTRA_END }; static u64 skx_cha_filter_mask(int fields) -- cgit v1.1 From 38115f2f8cec8087d558c062e779c443a01f87d6 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 21 Jul 2017 23:45:52 +0900 Subject: kprobes/x86: Release insn_slot in failure path The following commit: 003002e04ed3 ("kprobes: Fix arch_prepare_kprobe to handle copy insn failures") returns an error if the copying of the instruction, but does not release the allocated insn_slot. Clean up correctly. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 003002e04ed3 ("kprobes: Fix arch_prepare_kprobe to handle copy insn failures") Link: http://lkml.kernel.org/r/150064834183.6172.11694375818447664416.stgit@devbox Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6b87780..f015371 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -457,6 +457,8 @@ static int arch_copy_kprobe(struct kprobe *p) int arch_prepare_kprobe(struct kprobe *p) { + int ret; + if (alternatives_text_reserved(p->addr, p->addr)) return -EINVAL; @@ -467,7 +469,13 @@ int arch_prepare_kprobe(struct kprobe *p) if (!p->ainsn.insn) return -ENOMEM; - return arch_copy_kprobe(p); + ret = arch_copy_kprobe(p); + if (ret) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } + + return ret; } void arch_arm_kprobe(struct kprobe *p) -- cgit v1.1 From 18d5e6c34a8eda438d5ad8b3b15f42dab01bf05d Mon Sep 17 00:00:00 2001 From: Michael Davidson Date: Mon, 24 Jul 2017 16:51:55 -0700 Subject: x86/boot: #undef memcpy() et al in string.c undef memcpy() and friends in boot/string.c so that the functions defined here will have the correct names, otherwise we end up up trying to redefine __builtin_memcpy() etc. Surprisingly, GCC allows this (and, helpfully, discards the __builtin_ prefix from the function name when compiling it), but clang does not. Adding these #undef's appears to preserve what I assume was the original intent of the code. Signed-off-by: Michael Davidson Signed-off-by: Matthias Kaehlcke Acked-by: H. Peter Anvin Cc: Arnd Bergmann Cc: Bernhard.Rosenkranzer@linaro.org Cc: Greg Hackmann Cc: Kees Cook Cc: Linus Torvalds Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170724235155.79255-1-mka@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/string.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 630e366..16f4912 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -16,6 +16,15 @@ #include "ctype.h" #include "string.h" +/* + * Undef these macros so that the functions that we provide + * here will have the correct names regardless of how string.h + * may have chosen to #define them. + */ +#undef memcpy +#undef memset +#undef memcmp + int memcmp(const void *s1, const void *s2, size_t len) { bool diff; -- cgit v1.1 From 4ecf7191fdcc5686c1eb2b06d7f93ce29d636cf0 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Mon, 24 Jul 2017 14:22:48 +0200 Subject: x86/efi: Fix reboot_mode when EFI runtime services are disabled When EFI runtime services are disabled, for example by the "noefi" kernel cmdline parameter, the reboot_type could still be set to BOOT_EFI causing reboot to fail. Fix this by checking if EFI runtime services are enabled. Signed-off-by: Stefan Assmann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170724122248.24006-1-sassmann@kpanic.de [ Fixed 'not disabled' double negation. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 67393fc..a56bf60 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -471,12 +471,12 @@ static int __init reboot_init(void) /* * The DMI quirks table takes precedence. If no quirks entry - * matches and the ACPI Hardware Reduced bit is set, force EFI - * reboot. + * matches and the ACPI Hardware Reduced bit is set and EFI + * runtime services are enabled, force EFI reboot. */ rv = dmi_check_system(reboot_dmi_table); - if (!rv && efi_reboot_required()) + if (!rv && efi_reboot_required() && !efi_runtime_disabled()) reboot_type = BOOT_EFI; return 0; -- cgit v1.1 From a512177ef3bb92dbec8a96fe337b11c126bf9c91 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Jul 2017 18:54:38 +0200 Subject: KVM: x86: do mask out upper bits of PAE CR3 This reverts the change of commit f85c758dbee54cc3612a6e873ef7cecdb66ebee5, as the behavior it modified was intended. The VM is running in 32-bit PAE mode, and Table 4-7 of the Intel manual says: Table 4-7. Use of CR3 with PAE Paging Bit Position(s) Contents 4:0 Ignored 31:5 Physical address of the 32-Byte aligned page-directory-pointer table used for linear-address translation 63:32 Ignored (these bits exist only on processors supporting the Intel-64 architecture) To placate the static checker, write the mask explicitly as an unsigned long constant instead of using a 32-bit unsigned constant. Cc: Dan Carpenter Fixes: f85c758dbee54cc3612a6e873ef7cecdb66ebee5 Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 82a63c5..6c97c82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -597,8 +597,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - gfn = (kvm_read_cr3(vcpu) & ~31ul) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31ul) & (PAGE_SIZE - 1); + gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; + offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) -- cgit v1.1 From 210f84b0ca7743f3b2a9acfae81df668dbbb6a12 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Fri, 28 Apr 2017 13:13:58 +0800 Subject: x86: irq: Define a global vector for nested posted interrupts We are using the same vector for nested/non-nested posted interrupts delivery, this may cause interrupts latency in L1 since we can't kick the L2 vcpu out of vmx-nonroot mode. This patch introduces a new vector which is only for nested posted interrupts to solve the problems above. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/entry/entry_64.S | 1 + arch/x86/include/asm/entry_arch.h | 2 ++ arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 2 ++ arch/x86/include/asm/irq_vectors.h | 3 ++- arch/x86/kernel/irq.c | 19 +++++++++++++++++++ arch/x86/kernel/irqinit.c | 2 ++ 7 files changed, 29 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a9a8027..d271fb7 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -705,6 +705,7 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index df00299..07b0695 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -25,6 +25,8 @@ BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, smp_kvm_posted_intr_wakeup_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR, + smp_kvm_posted_intr_nested_ipi) #endif /* diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 9b76cd3..ad1ed53 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -15,6 +15,7 @@ typedef struct { #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; + unsigned int kvm_posted_intr_nested_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b90e105..d6dbafb 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); +extern asmlinkage void kvm_posted_intr_nested_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -62,6 +63,7 @@ extern void trace_call_function_single_interrupt(void); #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi #define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi +#define trace_kvm_posted_intr_nested_ipi kvm_posted_intr_nested_ipi #endif /* CONFIG_TRACING */ #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6ca9fd6..aaf8d28 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -83,7 +83,6 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ @@ -98,6 +97,8 @@ /* Vector for KVM to deliver posted interrupt IPI */ #ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 +#define POSTED_INTR_NESTED_VECTOR 0xf0 #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4aa03c5..4ed0aba 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -155,6 +155,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); seq_puts(p, " Posted-interrupt notification event\n"); + seq_printf(p, "%*s: ", prec, "NPI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_nested_ipis); + seq_puts(p, " Nested posted-interrupt event\n"); + seq_printf(p, "%*s: ", prec, "PIW"); for_each_online_cpu(j) seq_printf(p, "%10u ", @@ -313,6 +319,19 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } + +/* + * Handler for POSTED_INTERRUPT_NESTED_VECTOR. + */ +__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_nested_ipis); + exiting_irq(); + set_irq_regs(old_regs); +} #endif __visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7468c69..c7fd185 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -150,6 +150,8 @@ static void __init apic_intr_init(void) alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); /* IPI for KVM to deliver interrupt to wake up tasks */ alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); + /* IPI for KVM to deliver nested posted interrupt */ + alloc_intr_gate(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ -- cgit v1.1 From 06a5524f091b6b41b0007810c74cc9315923a145 Mon Sep 17 00:00:00 2001 From: Wincy Van Date: Fri, 28 Apr 2017 13:13:59 +0800 Subject: KVM: nVMX: Fix posted intr delivery when vcpu is in guest mode The PI vector for L0 and L1 must be different. If dest vcpu0 is in guest mode while vcpu1 is delivering a non-nested PI to vcpu0, there wont't be any vmexit so that the non-nested interrupt will be delayed. Signed-off-by: Wincy Van Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d04092f..497cf64 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4987,9 +4987,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) } } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + bool nested) { #ifdef CONFIG_SMP + int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; + if (vcpu->mode == IN_GUEST_MODE) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5007,8 +5010,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; } #endif @@ -5023,7 +5025,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. @@ -5057,7 +5059,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); } @@ -10064,13 +10066,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { - /* - * Note that we use L0's vector here and in - * vmx_deliver_nested_posted_interrupt. - */ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10941,7 +10939,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ vmx_flush_tlb(vcpu); } - + /* Restore posted intr vector. */ + if (nested_cpu_has_posted_intr(vmcs12)) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); -- cgit v1.1 From 2d6144e366fb39609aecf7a658e2e10af37627e9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 25 Jul 2017 03:40:46 -0700 Subject: KVM: nVMX: Fix loss of L2's NMI blocking state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run kvm-unit-tests/eventinj.flat in L1 w/ ept=0 on both L0 and L1: Before NMI IRET test Sending NMI to self NMI isr running stack 0x461000 Sending nested NMI to self After nested NMI to self Nested NMI isr running rip=40038e After iret After NMI to self FAIL: NMI Commit 4c4a6f790ee862 (KVM: nVMX: track NMI blocking state separately for each VMCS) tracks NMI blocking state separately for vmcs01 and vmcs02. However it is not enough: - The L2 (kvm-unit-tests/eventinj.flat) generates NMI that will fault on IRET, so the L2 can generate #PF which can be intercepted by L0. - L0 walks L1's guest page table and sees the mapping is invalid, it resumes the L1 guest and injects the #PF into L1. At this point the vmcs02 has nmi_known_unmasked=true. - L1 sets set bit 3 (blocking by NMI) in the interruptibility-state field of vmcs12 (and fixes the shadow page table) before resuming L2 guest. - L1 executes VMRESUME to resume L2, causing a vmexit to L0 - during VMRESUME emulation, prepare_vmcs02 sets bit 3 in the interruptibility-state field of vmcs02, but nmi_known_unmasked is still true. - L2 immediately exits to L0 with another page fault, because L0 still has not updated the NGVA->HPA page tables. However, nmi_known_unmasked is true so vmx_recover_nmi_blocking does not do anything. The fix is to update nmi_known_unmasked when preparing vmcs02 from vmcs12. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 497cf64..39a6222 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10042,6 +10042,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } -- cgit v1.1 From 1d518c6820daf4e00d29adfba980aee05f605f0f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 25 Jul 2017 00:43:15 -0700 Subject: KVM: LAPIC: Fix reentrancy issues with preempt notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preempt can occur in the preemption timer expiration handler: CPU0 CPU1 preemption timer vmexit handle_preemption_timer(vCPU0) kvm_lapic_expired_hv_timer hv_timer_is_use == true sched_out sched_in kvm_arch_vcpu_load kvm_lapic_restart_hv_timer restart_apic_timer start_hv_timer already-expired timer or sw timer triggerd in the window start_sw_timer cancel_hv_timer /* back in kvm_lapic_expired_hv_timer */ cancel_hv_timer WARN_ON(!apic->lapic_timer.hv_timer_in_use); ==> Oops This can be reproduced if CONFIG_PREEMPT is enabled. ------------[ cut here ]------------ WARNING: CPU: 4 PID: 2972 at /home/kernel/linux/arch/x86/kvm//lapic.c:1563 kvm_lapic_expired_hv_timer+0x9e/0xb0 [kvm] CPU: 4 PID: 2972 Comm: qemu-system-x86 Tainted: G OE 4.13.0-rc2+ #16 RIP: 0010:kvm_lapic_expired_hv_timer+0x9e/0xb0 [kvm] Call Trace: handle_preemption_timer+0xe/0x20 [kvm_intel] vmx_handle_exit+0xb8/0xd70 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xdd1/0x1be0 [kvm] ? kvm_arch_vcpu_load+0x47/0x230 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 do_syscall_64+0x81/0x220 entry_SYSCALL64_slow_path+0x25/0x25 ------------[ cut here ]------------ WARNING: CPU: 4 PID: 2972 at /home/kernel/linux/arch/x86/kvm//lapic.c:1498 cancel_hv_timer.isra.40+0x4f/0x60 [kvm] CPU: 4 PID: 2972 Comm: qemu-system-x86 Tainted: G W OE 4.13.0-rc2+ #16 RIP: 0010:cancel_hv_timer.isra.40+0x4f/0x60 [kvm] Call Trace: kvm_lapic_expired_hv_timer+0x3e/0xb0 [kvm] handle_preemption_timer+0xe/0x20 [kvm_intel] vmx_handle_exit+0xb8/0xd70 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xdd1/0x1be0 [kvm] ? kvm_arch_vcpu_load+0x47/0x230 [kvm] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 do_syscall_64+0x81/0x220 entry_SYSCALL64_slow_path+0x25/0x25 This patch fixes it by making the caller of cancel_hv_timer, start_hv_timer and start_sw_timer be in preemption-disabled regions, which trivially avoid any reentrancy issue with preempt notifier. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Add more WARNs. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2819d4c1..589dcc1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1495,11 +1495,10 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { + WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - preempt_disable(); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; - preempt_enable(); } static bool start_hv_timer(struct kvm_lapic *apic) @@ -1507,6 +1506,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) struct kvm_timer *ktimer = &apic->lapic_timer; int r; + WARN_ON(preemptible()); if (!kvm_x86_ops->set_hv_timer) return false; @@ -1538,6 +1538,8 @@ static bool start_hv_timer(struct kvm_lapic *apic) static void start_sw_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; + + WARN_ON(preemptible()); if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) @@ -1552,15 +1554,20 @@ static void start_sw_timer(struct kvm_lapic *apic) static void restart_apic_timer(struct kvm_lapic *apic) { + preempt_disable(); if (!start_hv_timer(apic)) start_sw_timer(apic); + preempt_enable(); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - WARN_ON(!apic->lapic_timer.hv_timer_in_use); + preempt_disable(); + /* If the preempt notifier has already run, it also called apic_timer_expired */ + if (!apic->lapic_timer.hv_timer_in_use) + goto out; WARN_ON(swait_active(&vcpu->wq)); cancel_hv_timer(apic); apic_timer_expired(apic); @@ -1569,6 +1576,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) advance_periodic_target_expiration(apic); restart_apic_timer(apic); } +out: + preempt_enable(); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); @@ -1582,9 +1591,11 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + preempt_disable(); /* Possibly the TSC deadline timer is not enabled yet */ if (apic->lapic_timer.hv_timer_in_use) start_sw_timer(apic); + preempt_enable(); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); -- cgit v1.1 From 20c6c189045539d29f4854d92b7ea9c329e1edfc Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 25 Jul 2017 14:50:53 -0700 Subject: x86/boot: Disable the address-of-packed-member compiler warning The clang warning 'address-of-packed-member' is disabled for the general kernel code, also disable it for the x86 boot code. This suppresses a bunch of warnings like this when building with clang: ./arch/x86/include/asm/processor.h:535:30: warning: taking address of packed member 'sp0' of class or structure 'x86_hw_tss' may result in an unaligned pointer value [-Waddress-of-packed-member] return this_cpu_read_stable(cpu_tss.x86_tss.sp0); ^~~~~~~~~~~~~~~~~~~ ./arch/x86/include/asm/percpu.h:391:59: note: expanded from macro 'this_cpu_read_stable' #define this_cpu_read_stable(var) percpu_stable_op("mov", var) ^~~ ./arch/x86/include/asm/percpu.h:228:16: note: expanded from macro 'percpu_stable_op' : "p" (&(var))); ^~~ Signed-off-by: Matthias Kaehlcke Cc: Doug Anderson Cc: Linus Torvalds Cc: Masahiro Yamada Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170725215053.135586-1-mka@chromium.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 2c860ad..8a95827 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -34,6 +34,7 @@ KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n -- cgit v1.1 From 4815d3c56d1e10449a44089a47544d9ba84fad0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 28 Jul 2017 14:45:03 +0200 Subject: cpufreq: x86: Make scaling_cur_freq behave more as expected After commit f8475cef9008 "x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF" the scaling_cur_freq policy attribute in sysfs only behaves as expected on x86 with APERF/MPERF registers available when it is read from at least twice in a row. The value returned by the first read may not be meaningful, because the computations in there use cached values from the previous iteration of aperfmperf_snapshot_khz() which may be stale. To prevent that from happening, modify arch_freq_get_on_cpu() to call aperfmperf_snapshot_khz() twice, with a short delay between these calls, if the previous invocation of aperfmperf_snapshot_khz() was too far back in the past (specifically, more that 1s ago). Also, as pointed out by Doug Smythies, aperf_delta is limited now and the multiplication of it by cpu_khz won't overflow, so simplify the s->khz computations too. Fixes: f8475cef9008 "x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF" Reported-by: Doug Smythies Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/cpu/aperfmperf.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index d869c86..7cf7c70 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -8,20 +8,25 @@ * This file is licensed under GPLv2. */ -#include +#include +#include #include #include #include struct aperfmperf_sample { unsigned int khz; - unsigned long jiffies; + ktime_t time; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +#define APERFMPERF_CACHE_THRESHOLD_MS 10 +#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_STALE_THRESHOLD_MS 1000 + /* * aperfmperf_snapshot_khz() * On the current CPU, snapshot APERF, MPERF, and jiffies @@ -33,9 +38,11 @@ static void aperfmperf_snapshot_khz(void *dummy) u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); + ktime_t now = ktime_get(); + s64 time_delta = ktime_ms_delta(now, s->time); - /* Don't bother re-computing within 10 ms */ - if (time_before(jiffies, s->jiffies + HZ/100)) + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) return; rdmsrl(MSR_IA32_APERF, aperf); @@ -51,22 +58,21 @@ static void aperfmperf_snapshot_khz(void *dummy) if (mperf_delta == 0) return; - /* - * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then - * khz = (cpu_khz * aperf_delta) / mperf_delta - */ - if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta) - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); - else /* khz = aperf_delta / (mperf_delta / cpu_khz) */ - s->khz = div64_u64(aperf_delta, - div64_u64(mperf_delta, cpu_khz)); - s->jiffies = jiffies; + s->time = now; s->aperf = aperf; s->mperf = mperf; + + /* If the previous iteration was too long ago, discard it. */ + if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) + s->khz = 0; + else + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); } unsigned int arch_freq_get_on_cpu(int cpu) { + unsigned int khz; + if (!cpu_khz) return 0; @@ -74,6 +80,12 @@ unsigned int arch_freq_get_on_cpu(int cpu) return 0; smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + khz = per_cpu(samples.khz, cpu); + if (khz) + return khz; + + msleep(APERFMPERF_REFRESH_DELAY_MS); + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); return per_cpu(samples.khz, cpu); } -- cgit v1.1 From bb68cfe2f5a7f43058aed299fdbb73eb281734ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jul 2017 22:07:09 +0200 Subject: x86/hpet: Cure interface abuse in the resume path The HPET resume path abuses irq_domain_[de]activate_irq() to restore the MSI message in the HPET chip for the boot CPU on resume and it relies on an implementation detail of the interrupt core code, which magically makes the HPET unmask call invoked via a irq_disable/enable pair. This worked as long as the irq code did unconditionally invoke the unmask() callback. With the recent changes which keep track of the masked state to avoid expensive hardware access, this does not longer work. As a consequence the HPET timer interrupts are not unmasked which breaks resume as the boot CPU waits forever that a timer interrupt arrives. Make the restore of the MSI message explicit and invoke the unmask() function directly. While at it get rid of the pointless affinity setting as nothing can change the affinity of the interrupt and the vector across suspend/resume. The restore of the MSI message reestablishes the previous affinity setting which is the correct one. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Reported-and-tested-by: Tomi Sarvela Reported-by: Martin Peres Signed-off-by: Thomas Gleixner Acked-by: "Rafael J. Wysocki" Cc: jeffy.chen@rock-chips.com Cc: Peter Zijlstra Cc: Marc Zyngier Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707312158590.2287@nanos --- arch/x86/kernel/hpet.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 16f82a3..8ce4212 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -345,21 +345,10 @@ static int hpet_shutdown(struct clock_event_device *evt, int timer) return 0; } -static int hpet_resume(struct clock_event_device *evt, int timer) -{ - if (!timer) { - hpet_enable_legacy_int(); - } else { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); - irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_hardirq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); - enable_irq(hdev->irq); - } +static int hpet_resume(struct clock_event_device *evt) +{ + hpet_enable_legacy_int(); hpet_print_config(); - return 0; } @@ -417,7 +406,7 @@ static int hpet_legacy_set_periodic(struct clock_event_device *evt) static int hpet_legacy_resume(struct clock_event_device *evt) { - return hpet_resume(evt, 0); + return hpet_resume(evt); } static int hpet_legacy_next_event(unsigned long delta, @@ -510,8 +499,14 @@ static int hpet_msi_set_periodic(struct clock_event_device *evt) static int hpet_msi_resume(struct clock_event_device *evt) { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + struct irq_data *data = irq_get_irq_data(hdev->irq); + struct msi_msg msg; - return hpet_resume(evt, hdev->num); + /* Restore the MSI msg and unmask the interrupt */ + irq_chip_compose_msi_msg(data, &msg); + hpet_msi_write(hdev, &msg); + hpet_msi_unmask(data); + return 0; } static int hpet_msi_next_event(unsigned long delta, -- cgit v1.1 From 7313c698050387a11c21afb0c6b4c61f21f7c042 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Jul 2017 10:31:25 +0200 Subject: KVM: nVMX: do not fill vm_exit_intr_error_code in prepare_vmcs12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do this in the caller of nested_vmx_vmexit instead. nested_vmx_check_exception was doing a vmwrite to the vmcs02's VM_EXIT_INTR_ERROR_CODE field, so that prepare_vmcs12 would move the field to vmcs12->vm_exit_intr_error_code. However that isn't possible on pre-Haswell machines. Moving the vmcs12 write to the callers fixes it. Reported-by: Jim Mattson Signed-off-by: Paolo Bonzini [Changed nested_vmx_reflect_vmexit() return type to (int)1 from (bool)1, thanks to fengguang.wu@intel.com] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 52 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 39a6222..19465b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2442,7 +2442,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) return 0; if (vcpu->arch.exception.nested_apf) { - vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code); + vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, @@ -2450,6 +2450,7 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) return 1; } + vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -2667,7 +2668,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_handled() will not pass related exits to L1. + * nested_vmx_exit_reflected() will not pass related exits to L1. * These rules have exceptions below. */ @@ -8019,12 +8020,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, * should handle it ourselves in L0 (and then continue L2). Only call this * when in is_guest_mode (L2). */ -static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) +static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) { u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_reason = vmx->exit_reason; trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, vmcs_readl(EXIT_QUALIFICATION), @@ -8169,6 +8169,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) } } +static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * At this point, the exit interruption info in exit_intr_info + * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT + * we need to query the in-kernel LAPIC. + */ + WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { *info1 = vmcs_readl(EXIT_QUALIFICATION); @@ -8415,12 +8438,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu, exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; - } + if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { dump_vmcs(); @@ -9509,12 +9528,14 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { + vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); - else + } else { kvm_inject_page_fault(vcpu, fault); + } } static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, @@ -10847,13 +10868,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_exit_reason = exit_reason; vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; - if ((vmcs12->vm_exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); -- cgit v1.1 From b96fb439774e1bfb7d027ad324fa48606167cb52 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Jul 2017 12:29:32 +0200 Subject: KVM: nVMX: fixes to nested virt interrupt injection There are three issues in nested_vmx_check_exception: 1) it is not taking PFEC_MATCH/PFEC_MASK into account, as reported by Wanpeng Li; 2) it should rebuild the interruption info and exit qualification fields from scratch, as reported by Jim Mattson, because the values from the L2->L0 vmexit may be invalid (e.g. if an emulated instruction causes a page fault, the EPT misconfig's exit qualification is incorrect). 3) CR2 and DR6 should not be written for exception intercept vmexits (CR2 only for AMD). This patch fixes the first two and adds a comment about the last, outlining the fix. Cc: Jim Mattson Cc: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +++++++ arch/x86/kvm/vmx.c | 87 ++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 72 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4d8141e..1107626 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2430,6 +2430,16 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = error_code; + + /* + * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception. + * The fix is to add the ancillary datum (CR2 or DR6) to structs + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be + * written only when inject_pending_event runs (DR6 would written here + * too). This should be conditional on a new capability---if the + * capability is disabled, kvm_multiple_exception would write the + * ancillary information to CR2 or DR6, for backwards ABI-compatibility. + */ if (svm->vcpu.arch.exception.nested_apf) svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 19465b7..714d436 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -927,6 +927,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2428,6 +2432,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, + unsigned long exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + if (vcpu->arch.exception.has_error_code) { + vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (kvm_exception_is_soft(nr)) + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && + vmx_get_nmi_mask(vcpu)) + intr_info |= INTR_INFO_UNBLOCK_NMI; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); +} + /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. @@ -2437,24 +2465,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned int nr = vcpu->arch.exception.nr; - if (!((vmcs12->exception_bitmap & (1u << nr)) || - (nr == PF_VECTOR && vcpu->arch.exception.nested_apf))) - return 0; + if (nr == PF_VECTOR) { + if (vcpu->arch.exception.nested_apf) { + nested_vmx_inject_exception_vmexit(vcpu, + vcpu->arch.apf.nested_apf_token); + return 1; + } + /* + * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception. + * The fix is to add the ancillary datum (CR2 or DR6) to structs + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 + * can be written only when inject_pending_event runs. This should be + * conditional on a new capability---if the capability is disabled, + * kvm_multiple_exception would write the ancillary information to + * CR2 or DR6, for backwards ABI-compatibility. + */ + if (nested_vmx_is_page_fault_vmexit(vmcs12, + vcpu->arch.exception.error_code)) { + nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2); + return 1; + } + } else { + unsigned long exit_qual = 0; + if (nr == DB_VECTOR) + exit_qual = vcpu->arch.dr6; - if (vcpu->arch.exception.nested_apf) { - vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | - INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, - vcpu->arch.apf.nested_apf_token); - return 1; + if (vmcs12->exception_bitmap & (1u << nr)) { + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + return 1; + } } - vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; + return 0; } static void vmx_queue_exception(struct kvm_vcpu *vcpu) @@ -9529,10 +9571,11 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { - vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + vmcs12->vm_exit_intr_error_code = fault->error_code; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + fault->address); } else { kvm_inject_page_fault(vcpu, fault); } @@ -10115,12 +10158,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. - * - * A problem with this approach (when !enable_ept) is that L1 may be - * injected with more page faults than it asked for. This could have - * caused problems, but in practice existing hypervisors don't care. - * To fix this, we will need to emulate the PFEC checking (on the L1 - * page tables), using walk_addr(), when injecting PFs to L1. */ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, enable_ept ? vmcs12->page_fault_error_code_mask : 0); -- cgit v1.1 From 337c017ccdf2653d0040099433fc1a2b1beb5926 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 1 Aug 2017 05:20:03 -0700 Subject: KVM: async_pf: make rcu irq exit if not triggered from idle task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WARNING: CPU: 5 PID: 1242 at kernel/rcu/tree_plugin.h:323 rcu_note_context_switch+0x207/0x6b0 CPU: 5 PID: 1242 Comm: unity-settings- Not tainted 4.13.0-rc2+ #1 RIP: 0010:rcu_note_context_switch+0x207/0x6b0 Call Trace: __schedule+0xda/0xba0 ? kvm_async_pf_task_wait+0x1b2/0x270 schedule+0x40/0x90 kvm_async_pf_task_wait+0x1cc/0x270 ? prepare_to_swait+0x22/0x70 do_async_page_fault+0x77/0xb0 ? do_async_page_fault+0x77/0xb0 async_page_fault+0x28/0x30 RIP: 0010:__d_lookup_rcu+0x90/0x1e0 I encounter this when trying to stress the async page fault in L1 guest w/ L2 guests running. Commit 9b132fbe5419 (Add rcu user eqs exception hooks for async page fault) adds rcu_irq_enter/exit() to kvm_async_pf_task_wait() to exit cpu idle eqs when needed, to protect the code that needs use rcu. However, we need to call the pair even if the function calls schedule(), as seen from the above backtrace. This patch fixes it by informing the RCU subsystem exit/enter the irq towards/away from idle for both n.halted and !n.halted. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Paul E. McKenney Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kernel/kvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 71c17a5..d04e30e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -151,6 +151,8 @@ void kvm_async_pf_task_wait(u32 token) if (hlist_unhashed(&n.link)) break; + rcu_irq_exit(); + if (!n.halted) { local_irq_enable(); schedule(); @@ -159,11 +161,11 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ - rcu_irq_exit(); native_safe_halt(); local_irq_disable(); - rcu_irq_enter(); } + + rcu_irq_enter(); } if (!n.halted) finish_swait(&n.wq, &wait); -- cgit v1.1 From f4ef19108608c81769db69976999d056c070a6f0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 1 Aug 2017 16:05:25 -0700 Subject: KVM: X86: Fix loss of pending INIT due to race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SMP VM start, AP may lost INIT because of receiving INIT between kvm_vcpu_ioctl_x86_get/set_vcpu_events. vcpu 0 vcpu 1 kvm_vcpu_ioctl_x86_get_vcpu_events events->smi.latched_init = 0 send INIT to vcpu1 set vcpu1's pending_events kvm_vcpu_ioctl_x86_set_vcpu_events if (events->smi.latched_init == 0) clear INIT in pending_events This patch fixes it by just update SMM related flags if we are in SMM. Thanks Peng Hao for the report and original commit message. Reported-by: Peng Hao Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c97c82..037055a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3159,15 +3159,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_set_hflags(vcpu, hflags); vcpu->arch.smi_pending = events->smi.pending; - if (events->smi.smm_inside_nmi) - vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; - else - vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (lapic_in_kernel(vcpu)) { - if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + + if (events->smi.smm) { + if (events->smi.smm_inside_nmi) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; + if (lapic_in_kernel(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + } } } -- cgit v1.1 From ebd28fcb55e288030abb5bca4869603b3e1f5f7c Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Wed, 2 Aug 2017 11:20:51 +0800 Subject: KVM: X86: init irq->level in kvm_pv_kick_cpu_op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'lapic_irq' is a local variable and its 'level' field isn't initialized, so 'level' is random, it doesn't matter but makes UBSAN unhappy: UBSAN: Undefined behaviour in .../lapic.c:... load of value 10 is not a valid value for type '_Bool' ... Call Trace: [] dump_stack+0x1e/0x20 [] ubsan_epilogue+0x12/0x55 [] __ubsan_handle_load_invalid_value+0x118/0x162 [] kvm_apic_set_irq+0xc3/0xf0 [kvm] [] kvm_irq_delivery_to_apic_fast+0x450/0x910 [kvm] [] kvm_irq_delivery_to_apic+0xfa/0x7a0 [kvm] [] kvm_emulate_hypercall+0x62e/0x760 [kvm] [] handle_vmcall+0x1a/0x30 [kvm_intel] [] vmx_handle_exit+0x7a2/0x1fa0 [kvm_intel] ... Signed-off-by: Longpeng(Mike) Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 037055a..d734aa8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6218,6 +6218,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.shorthand = 0; lapic_irq.dest_mode = 0; + lapic_irq.level = 0; lapic_irq.dest_id = apicid; lapic_irq.msi_redir_hint = false; -- cgit v1.1 From 9f744c59746078280ef28163aa136ef3f625804e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Jul 2017 15:54:46 +0200 Subject: KVM: nVMX: do not pin the VMCS12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the current implementation of VMCS12 does a memcpy in and out of guest memory, we do not need current_vmcs12 and current_vmcs12_page anymore. current_vmptr is enough to read and write the VMCS12. And David Matlack noted: This patch also fixes dirty tracking (memslot->dirty_bitmap) of the VMCS12 page by using kvm_write_guest. nested_release_page() only marks the struct page dirty. Signed-off-by: Paolo Bonzini Reviewed-by: David Hildenbrand [Added David Matlack's note and nested_release_page_clean() fix.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 714d436..082cdb9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -416,9 +416,6 @@ struct nested_vmx { /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; - /* The host-usable pointer to the above */ - struct page *current_vmcs12_page; - struct vmcs12 *current_vmcs12; /* * Cache of the guest's VMCS, existing outside of guest memory. * Loaded from guest memory during VMPTRLD. Flushed to guest @@ -7182,10 +7179,6 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) if (vmx->nested.current_vmptr == -1ull) return; - /* current_vmptr and current_vmcs12 are always set/reset together */ - if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) - return; - if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case they were modified */ @@ -7198,13 +7191,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ - memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, - VMCS12_SIZE); + kvm_vcpu_write_guest_page(&vmx->vcpu, + vmx->nested.current_vmptr >> PAGE_SHIFT, + vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; } /* @@ -7622,14 +7613,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_release_vmcs12(vmx); - vmx->nested.current_vmcs12 = new_vmcs12; - vmx->nested.current_vmcs12_page = page; /* * Load VMCS12 from guest memory since it is not already * cached. */ - memcpy(vmx->nested.cached_vmcs12, - vmx->nested.current_vmcs12, VMCS12_SIZE); + memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); + kunmap(page); + nested_release_page_clean(page); + set_current_vmptr(vmx, vmptr); } @@ -9284,7 +9275,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; -- cgit v1.1 From 8ca44e88c32f86721c6ceca8e5e9b0b40c98907d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 1 Aug 2017 14:00:39 -0700 Subject: kvm: nVMX: don't flush VMCS12 during VMXOFF or VCPU teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the Intel SDM, software cannot rely on the current VMCS to be coherent after a VMXOFF or shutdown. So this is a valid way to handle VMCS12 flushes. 24.11.1 Software Use of Virtual-Machine Control Structures ... If a logical processor leaves VMX operation, any VMCSs active on that logical processor may be corrupted (see below). To prevent such corruption of a VMCS that may be used either after a return to VMX operation or on another logical processor, software should execute VMCLEAR for that VMCS before executing the VMXOFF instruction or removing power from the processor (e.g., as part of a transition to the S3 and S4 power states). ... This fixes a "suspicious rcu_dereference_check() usage!" warning during kvm_vm_release() because nested_release_vmcs12() calls kvm_vcpu_write_guest_page() without holding kvm->srcu. Signed-off-by: David Matlack Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 082cdb9..2099b14 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -419,7 +419,7 @@ struct nested_vmx { /* * Cache of the guest's VMCS, existing outside of guest memory. * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMXOFF, VMCLEAR, VMPTRLD. + * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; /* @@ -7174,6 +7174,12 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); +} + static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { if (vmx->nested.current_vmptr == -1ull) @@ -7184,9 +7190,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; @@ -7209,12 +7213,14 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); - nested_release_vmcs12(vmx); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; if (vmx->nested.msr_bitmap) { free_page((unsigned long)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); free_vmcs(vmx->vmcs01.shadow_vmcs); vmx->vmcs01.shadow_vmcs = NULL; -- cgit v1.1 From c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 1 Aug 2017 14:00:40 -0700 Subject: KVM: nVMX: mark vmcs12 pages dirty on L2 exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host physical addresses of L1's Virtual APIC Page and Posted Interrupt descriptor are loaded into the VMCS02. The CPU may write to these pages via their host physical address while L2 is running, bypassing address-translation-based dirty tracking (e.g. EPT write protection). Mark them dirty on every exit from L2 to prevent them from getting out of sync with dirty tracking. Also mark the virtual APIC page and the posted interrupt descriptor dirty when KVM is virtualizing posted interrupt processing. Signed-off-by: David Matlack Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2099b14..cbad87e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4995,6 +4995,28 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5002,18 +5024,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) void *vapic_page; u16 status; - if (vmx->nested.pi_desc && - vmx->nested.pi_pending) { - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit( - (unsigned long *)vmx->nested.pi_desc->pir, 256); + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + return; - if (max_irr == 256) - return; + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return; + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -5025,6 +5044,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } + + nested_mark_vmcs12_pages_dirty(vcpu); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -8072,6 +8093,18 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); + /* + * The host physical addresses of some pages of guest memory + * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU + * may write to these pages via their host physical address while + * L2 is running, bypassing any address-translation-based dirty + * tracking (e.g. EPT write protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + if (vmx->nested.nested_run_pending) return false; -- cgit v1.1 From 6550c4df7e50d09f86385ce20fd7e969a5638da3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 31 Jul 2017 19:25:27 -0700 Subject: KVM: nVMX: Fix interrupt window request with "Acknowledge interrupt on exit" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------[ cut here ]------------ WARNING: CPU: 5 PID: 2288 at arch/x86/kvm/vmx.c:11124 nested_vmx_vmexit+0xd64/0xd70 [kvm_intel] CPU: 5 PID: 2288 Comm: qemu-system-x86 Not tainted 4.13.0-rc2+ #7 RIP: 0010:nested_vmx_vmexit+0xd64/0xd70 [kvm_intel] Call Trace: vmx_check_nested_events+0x131/0x1f0 [kvm_intel] ? vmx_check_nested_events+0x131/0x1f0 [kvm_intel] kvm_arch_vcpu_ioctl_run+0x5dd/0x1be0 [kvm] ? vmx_vcpu_load+0x1be/0x220 [kvm_intel] ? kvm_arch_vcpu_load+0x62/0x230 [kvm] kvm_vcpu_ioctl+0x340/0x700 [kvm] ? kvm_vcpu_ioctl+0x340/0x700 [kvm] ? __fget+0xfc/0x210 do_vfs_ioctl+0xa4/0x6a0 ? __fget+0x11d/0x210 SyS_ioctl+0x79/0x90 do_syscall_64+0x8f/0x750 ? trace_hardirqs_on_thunk+0x1a/0x1c entry_SYSCALL64_slow_path+0x25/0x25 This can be reproduced by booting L1 guest w/ 'noapic' grub parameter, which means that tells the kernel to not make use of any IOAPICs that may be present in the system. Actually external_intr variable in nested_vmx_vmexit() is the req_int_win variable passed from vcpu_enter_guest() which means that the L0's userspace requests an irq window. I observed the scenario (!kvm_cpu_has_interrupt(vcpu) && L0's userspace reqeusts an irq window) is true, so there is no interrupt which L1 requires to inject to L2, we should not attempt to emualte "Acknowledge interrupt on exit" for the irq window requirement in this scenario. This patch fixes it by not attempt to emulate "Acknowledge interrupt on exit" if there is no L1 requirement to inject an interrupt to L2. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Added code comment to make it obvious that the behavior is not correct. We should do a userspace exit with open interrupt window instead of the nested VM exit. This patch still improves the behavior, so it was accepted as a (temporary) workaround.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cbad87e..9b21b12 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11131,8 +11131,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) - && nested_exit_intr_ack_set(vcpu)) { + /* + * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of + * the VM-exit interrupt information (valid interrupt) is always set to + * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need + * kvm_cpu_has_interrupt(). See the commit message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); vmcs12->vm_exit_intr_info = irq | -- cgit v1.1 From 8861249c740fc4af9ddc5aee321eafefb960d7c6 Mon Sep 17 00:00:00 2001 From: "megha.dey@linux.intel.com" Date: Wed, 2 Aug 2017 13:49:09 -0700 Subject: crypto: x86/sha1 - Fix reads beyond the number of blocks passed It was reported that the sha1 AVX2 function(sha1_transform_avx2) is reading ahead beyond its intended data, and causing a crash if the next block is beyond page boundary: http://marc.info/?l=linux-crypto-vger&m=149373371023377 This patch makes sure that there is no overflow for any buffer length. It passes the tests written by Jan Stancek that revealed this problem: https://github.com/jstancek/sha1-avx2-crash I have re-enabled sha1-avx2 by reverting commit b82ce24426a4071da9529d726057e4e642948667 Cc: Fixes: b82ce24426a4 ("crypto: sha1-ssse3 - Disable avx2") Originally-by: Ilya Albrekht Tested-by: Jan Stancek Signed-off-by: Megha Dey Reported-by: Jan Stancek Signed-off-by: Herbert Xu --- arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 ++++++++++++++++++---------------- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- 2 files changed, 37 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1cd792d..1eab79c 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm -#define K_BASE %r8 #define HASH_PTR %r9 +#define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 -#define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ - vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) - vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -446,6 +445,16 @@ .endm +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks - PRECALC_OFFSET = 0 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr - PRECALC_OFFSET = 128 + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ - cmp K_BASE, BUFFER_PTR - jne _begin + test BLOCKS_CTR, BLOCKS_CTR + jnz _begin .align 32 jmp _end .align 32 @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr - add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ - cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ - + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E - cmp K_BASE, BUFFER_PTR /* is current block the last one? */ - je _loop + test BLOCKS_CTR, BLOCKS_CTR + jz _loop mov TB, B @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr - add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ - - cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: @@ -641,19 +653,12 @@ _loop3: avx2_zeroupper - lea K_XMM_AR(%rip), K_BASE - + /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR - lea 64(BUF), BUFFER_PTR2 - - shl $6, CNT /* mul by 64 */ - add BUF, CNT - add $64, CNT - mov CNT, BUFFER_END - cmp BUFFER_END, BUFFER_PTR2 - cmovae K_BASE, BUFFER_PTR2 + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index f960a04..fc61739 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, static bool avx2_usable(void) { - if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) return true; -- cgit v1.1 From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. Fix this confusion by passing along vma->vm_mm instead of relying on current->mm. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e3db8f..af12e29 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2114,7 +2114,7 @@ static void refresh_pce(void *ignored) load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } -static void x86_pmu_event_mapped(struct perf_event *event) +static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; @@ -2129,22 +2129,20 @@ static void x86_pmu_event_mapped(struct perf_event *event) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(¤t->mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_sem); - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } -static void x86_pmu_event_unmapped(struct perf_event *event) +static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!current->mm) - return; if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) -- cgit v1.1 From e93c17301ac55321fc18e0f8316e924e58a83c8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 7 Aug 2017 19:43:13 -0700 Subject: x86/asm/64: Clear AC on NMI entries This closes a hole in our SMAP implementation. This patch comes from grsecurity. Good catch! Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/314cc9f294e8f14ed85485727556ad4f15bb1659.1502159503.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d271fb7..6d078b8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1211,6 +1211,8 @@ ENTRY(nmi) * other IST entries. */ + ASM_CLAC + /* Use %rdx as our temp variable throughout */ pushq %rdx -- cgit v1.1 From 10e66760fa8ee11f254a69433fc132d04758a5fc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 3 Aug 2017 12:58:18 +0200 Subject: x86/smpboot: Unbreak CPU0 hotplug A hang on CPU0 onlining after a preceding offlining is observed. Trace shows that CPU0 is stuck in check_tsc_sync_target() waiting for source CPU to run check_tsc_sync_source() but this never happens. Source CPU, in its turn, is stuck on synchronize_sched() which is called from native_cpu_up() -> do_boot_cpu() -> unregister_nmi_handler(). So it's a classic ABBA deadlock, due to the use of synchronize_sched() in unregister_nmi_handler(). Fix the bug by moving unregister_nmi_handler() from do_boot_cpu() to native_cpu_up() after cpu onlining is done. Signed-off-by: Vitaly Kuznetsov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170803105818.9934-1-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b474c8d..54b9e89 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -971,7 +971,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, + int *cpu0_nmi_registered) { volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); @@ -979,7 +980,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long start_ip = real_mode_header->trampoline_start; unsigned long boot_error = 0; - int cpu0_nmi_registered = 0; unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); @@ -1035,7 +1035,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, - &cpu0_nmi_registered); + cpu0_nmi_registered); if (!boot_error) { /* @@ -1080,12 +1080,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) */ smpboot_restore_warm_reset_vector(); } - /* - * Clean up the nmi handler. Do this after the callin and callout sync - * to avoid impact of possible long unregister time. - */ - if (cpu0_nmi_registered) - unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); return boot_error; } @@ -1093,8 +1087,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); + int cpu0_nmi_registered = 0; unsigned long flags; - int err; + int err, ret = 0; WARN_ON(irqs_disabled()); @@ -1131,10 +1126,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); - err = do_boot_cpu(apicid, cpu, tidle); + err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); - return -EIO; + ret = -EIO; + goto unreg_nmi; } /* @@ -1150,7 +1146,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } - return 0; +unreg_nmi: + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + + return ret; } /** -- cgit v1.1 From 8e2f3bce05e056575c2c84a344a8291fdabb5f21 Mon Sep 17 00:00:00 2001 From: Doug Smythies Date: Tue, 8 Aug 2017 14:12:49 -0700 Subject: cpufreq: x86: Disable interrupts during MSRs reading According to Intel 64 and IA-32 Architectures SDM, Volume 3, Chapter 14.2, "Software needs to exercise care to avoid delays between the two RDMSRs (for example interrupts)". So, disable interrupts during reading MSRs IA32_APERF and IA32_MPERF. See also: commit 4ab60c3f32c7 (cpufreq: intel_pstate: Disable interrupts during MSRs reading). Signed-off-by: Doug Smythies Reviewed-by: Len Brown Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/cpu/aperfmperf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 7cf7c70..0ee8332 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -40,13 +40,16 @@ static void aperfmperf_snapshot_khz(void *dummy) struct aperfmperf_sample *s = this_cpu_ptr(&samples); ktime_t now = ktime_get(); s64 time_delta = ktime_ms_delta(now, s->time); + unsigned long flags; /* Don't bother re-computing within the cache threshold time. */ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) return; + local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); + local_irq_restore(flags); aperf_delta = aperf - s->aperf; mperf_delta = mperf - s->mperf; -- cgit v1.1 From 5442c26995527245c94c4a49e535eae8a60a5299 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Aug 2017 20:55:52 +0200 Subject: x86/cpufeature, kvm/svm: Rename (shorten) the new "virtualized VMSAVE/VMLOAD" CPUID flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "virtual_vmload_vmsave" is what is going to land in /proc/cpuinfo now as per v4.13-rc4, for a single feature bit which is clearly too long. So rename it to what it is called in the processor manual. "v_vmsave_vmload" is a bit shorter, after all. We could go more aggressively here but having it the same as in the processor manual is advantageous. Signed-off-by: Borislav Petkov Acked-by: Radim Krčmář Cc: Janakarajan Natarajan Cc: Jörg Rödel Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm-ML Link: http://lkml.kernel.org/r/20170801185552.GA3743@nazgul.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kvm/svm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ca3c48c..5a28e8e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,7 +286,7 @@ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1107626..56ba053 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1100,7 +1100,7 @@ static __init int svm_hardware_setup(void) if (vls) { if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || !IS_ENABLED(CONFIG_X86_64)) { vls = false; } else { -- cgit v1.1 From b45e4c45b13275a6b4a3f83ae8301a1963fbe5d0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 10 Aug 2017 16:57:09 +0100 Subject: x86: Mark various structures and functions as 'static' Mark a couple of structures and functions as 'static', pointed out by Sparse: warning: symbol 'bts_pmu' was not declared. Should it be static? warning: symbol 'p4_event_aliases' was not declared. Should it be static? warning: symbol 'rapl_attr_groups' was not declared. Should it be static? symbol 'process_uv2_message' was not declared. Should it be static? Signed-off-by: Colin Ian King Acked-by: Andrew Banman # for the UV change Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Will Deacon Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20170810155709.7094-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 2 +- arch/x86/events/intel/p4.c | 2 +- arch/x86/events/intel/rapl.c | 2 +- arch/x86/platform/uv/tlb_uv.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 8ae8c5c..ddd8d35 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -69,7 +69,7 @@ struct bts_buffer { struct bts_phys buf[0]; }; -struct pmu bts_pmu; +static struct pmu bts_pmu; static size_t buf_size(struct page *page) { diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index eb05335..d32c0ee 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -587,7 +587,7 @@ static __initconst const u64 p4_hw_cache_event_ids * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are * either up to date automatically or not applicable at all. */ -struct p4_event_alias { +static struct p4_event_alias { u64 original; u64 alternative; } p4_event_aliases[] = { diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index a45e211..8e2457c 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -559,7 +559,7 @@ static struct attribute_group rapl_pmu_format_group = { .attrs = rapl_formats_attr, }; -const struct attribute_group *rapl_attr_groups[] = { +static const struct attribute_group *rapl_attr_groups[] = { &rapl_pmu_attr_group, &rapl_pmu_format_group, &rapl_pmu_events_group, diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3e4bdb4..f44c0bc 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -26,7 +26,7 @@ static struct bau_operations ops __ro_after_init; /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static int timeout_base_ns[] = { +static const int timeout_base_ns[] = { 20, 160, 1280, @@ -1216,7 +1216,7 @@ static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. * Such a message must be ignored. */ -void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) { unsigned long mmr_image; unsigned char swack_vec; -- cgit v1.1 From c138d81163d82db044dcaf1141395713f03bf0bf Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 28 Jul 2017 12:23:12 +0200 Subject: x86: provide an init_mem_mapping hypervisor hook Provide a hook in hypervisor_x86 called after setting up initial memory mapping. This is needed e.g. by Xen HVM guests to map the hypervisor shared info page. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Acked-by: Ingo Molnar Signed-off-by: Juergen Gross --- arch/x86/include/asm/hypervisor.h | 10 ++++++++++ arch/x86/mm/init.c | 3 +++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 2112615..0ead9db 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -43,6 +43,9 @@ struct hypervisor_x86 { /* pin current vcpu to specified physical cpu (run rarely) */ void (*pin_vcpu)(int); + + /* called during init_mem_mapping() to setup early mappings. */ + void (*init_mem_mapping)(void); }; extern const struct hypervisor_x86 *x86_hyper; @@ -57,8 +60,15 @@ extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); extern void hypervisor_pin_vcpu(int cpu); + +static inline void hypervisor_init_mem_mapping(void) +{ + if (x86_hyper && x86_hyper->init_mem_mapping) + x86_hyper->init_mem_mapping(); +} #else static inline void init_hypervisor_platform(void) { } static inline bool hypervisor_x2apic_available(void) { return false; } +static inline void hypervisor_init_mem_mapping(void) { } #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 673541e..bf3f106 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -18,6 +18,7 @@ #include /* for MAX_DMA_PFN */ #include #include +#include /* * We need to define the tracepoints somewhere, and tlb.c @@ -636,6 +637,8 @@ void __init init_mem_mapping(void) load_cr3(swapper_pg_dir); __flush_tlb_all(); + hypervisor_init_mem_mapping(); + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } -- cgit v1.1 From 10231f69eb039550864ff3eb395da0c63c03ed5f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 28 Jul 2017 12:23:13 +0200 Subject: xen: split up xen_hvm_init_shared_info() Instead of calling xen_hvm_init_shared_info() on boot and resume split it up into a boot time function searching for the pfn to use and a mapping function doing the hypervisor mapping call. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Acked-by: Ingo Molnar Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_hvm.c | 45 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 87d7913..d23531f 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -21,29 +21,9 @@ #include "mmu.h" #include "smp.h" -void __ref xen_hvm_init_shared_info(void) +void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; - u64 pa; - - if (HYPERVISOR_shared_info == &xen_dummy_shared_info) { - /* - * Search for a free page starting at 4kB physical address. - * Low memory is preferred to avoid an EPT large page split up - * by the mapping. - * Starting below X86_RESERVE_LOW (usually 64kB) is fine as - * the BIOS used for HVM guests is well behaved and won't - * clobber memory other than the first 4kB. - */ - for (pa = PAGE_SIZE; - !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) || - memblock_is_reserved(pa); - pa += PAGE_SIZE) - ; - - memblock_reserve(pa, PAGE_SIZE); - HYPERVISOR_shared_info = __va(pa); - } xatp.domid = DOMID_SELF; xatp.idx = 0; @@ -53,6 +33,28 @@ void __ref xen_hvm_init_shared_info(void) BUG(); } +static void __init reserve_shared_info(void) +{ + u64 pa; + + /* + * Search for a free page starting at 4kB physical address. + * Low memory is preferred to avoid an EPT large page split up + * by the mapping. + * Starting below X86_RESERVE_LOW (usually 64kB) is fine as + * the BIOS used for HVM guests is well behaved and won't + * clobber memory other than the first 4kB. + */ + for (pa = PAGE_SIZE; + !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) || + memblock_is_reserved(pa); + pa += PAGE_SIZE) + ; + + memblock_reserve(pa, PAGE_SIZE); + HYPERVISOR_shared_info = __va(pa); +} + static void __init init_hvm_pv_info(void) { int major, minor; @@ -153,6 +155,7 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); + reserve_shared_info(); xen_hvm_init_shared_info(); /* -- cgit v1.1 From 4ca83dcf4e3bc0c98836dbb97553792ca7ea5429 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 28 Jul 2017 12:23:14 +0200 Subject: xen: fix hvm guest with kaslr enabled A Xen HVM guest running with KASLR enabled will die rather soon today because the shared info page mapping is using va() too early. This was introduced by commit a5d5f328b0e2baa5ee7c119fd66324eb79eeeb66 ("xen: allocate page for shared info page from low memory"). In order to fix this use early_memremap() to get a temporary virtual address for shared info until va() can be used safely. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Acked-by: Ingo Molnar Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_hvm.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index d23531f..de503c2 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -21,6 +22,8 @@ #include "mmu.h" #include "smp.h" +static unsigned long shared_info_pfn; + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; @@ -28,7 +31,7 @@ void xen_hvm_init_shared_info(void) xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = virt_to_pfn(HYPERVISOR_shared_info); + xatp.gpfn = shared_info_pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); } @@ -51,8 +54,16 @@ static void __init reserve_shared_info(void) pa += PAGE_SIZE) ; + shared_info_pfn = PHYS_PFN(pa); + memblock_reserve(pa, PAGE_SIZE); - HYPERVISOR_shared_info = __va(pa); + HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE); +} + +static void __init xen_hvm_init_mem_mapping(void) +{ + early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE); + HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); } static void __init init_hvm_pv_info(void) @@ -221,5 +232,6 @@ const struct hypervisor_x86 x86_hyper_xen_hvm = { .init_platform = xen_hvm_guest_init, .pin_vcpu = xen_pin_vcpu, .x2apic_available = xen_x2apic_para_available, + .init_mem_mapping = xen_hvm_init_mem_mapping, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); -- cgit v1.1 From 84393817db09bb436e934f8f8cc981cbca9ea4dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 13:03:47 +0200 Subject: x86/mtrr: Prevent CPU hotplug lock recursion Larry reported a CPU hotplug lock recursion in the MTRR code. ============================================ WARNING: possible recursive locking detected systemd-udevd/153 is trying to acquire lock: (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] stop_machine+0x16/0x30 but task is already holding lock: (cpu_hotplug_lock.rw_sem){.+.+.+}, at: [] mtrr_add_page+0x83/0x470 .... cpus_read_lock+0x48/0x90 stop_machine+0x16/0x30 mtrr_add_page+0x18b/0x470 mtrr_add+0x3e/0x70 mtrr_add_page() holds the hotplug rwsem already and calls stop_machine() which acquires it again. Call stop_machine_cpuslocked() instead. Reported-and-tested-by: Larry Finger Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708140920250.1865@nanos Cc: "Paul E. McKenney" Cc: Borislav Petkov --- arch/x86/kernel/cpu/mtrr/main.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c5bb63b..40d5a8a 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); } +static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { @@ -370,7 +382,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { - set_mtrr(i, base, size, type); + set_mtrr_cpuslocked(i, base, size, type); if (likely(replace < 0)) { mtrr_usage_table[i] = 1; } else { @@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (increment) mtrr_usage_table[i]++; if (unlikely(replace != i)) { - set_mtrr(replace, 0, 0, 0); + set_mtrr_cpuslocked(replace, 0, 0, 0); mtrr_usage_table[replace] = 0; } } @@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) goto out; } if (--mtrr_usage_table[reg] < 1) - set_mtrr(reg, 0, 0, 0); + set_mtrr_cpuslocked(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); -- cgit v1.1 From 47ac5484fd961420e5ec0bb5b972fde381f57365 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Aug 2017 17:39:52 +0200 Subject: x86: Fix norandmaps/ADDR_NO_RANDOMIZE Documentation/admin-guide/kernel-parameters.txt says: norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space but it doesn't work because arch_rnd() which is used to randomize mm->mmap_base returns a random value unconditionally. And as Kirill pointed out, ADDR_NO_RANDOMIZE is broken by the same reason. Just shift the PF_RANDOMIZE check from arch_mmap_rnd() to arch_rnd(). Fixes: 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()") Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Acked-by: Cyrill Gorcunov Reviewed-by: Dmitry Safonov Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170815153952.GA1076@redhat.com --- arch/x86/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 229d04a..c94df12 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -79,13 +79,13 @@ static int mmap_is_legacy(void) static unsigned long arch_rnd(unsigned int rndbits) { + if (!(current->flags & PF_RANDOMIZE)) + return 0; return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; } unsigned long arch_mmap_rnd(void) { - if (!(current->flags & PF_RANDOMIZE)) - return 0; return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -- cgit v1.1 From 01578e36163cdd0e4fd61d9976de15f13364e26d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Aug 2017 17:40:11 +0200 Subject: x86/elf: Remove the unnecessary ADDR_NO_RANDOMIZE checks The ADDR_NO_RANDOMIZE checks in stack_maxrandom_size() and randomize_stack_top() are not required. PF_RANDOMIZE is set by load_elf_binary() only if ADDR_NO_RANDOMIZE is not set, no need to re-check after that. Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170815154011.GB1076@redhat.com --- arch/x86/mm/mmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c94df12..a88cfbf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -50,8 +50,7 @@ unsigned long tasksize_64bit(void) static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { + if (current->flags & PF_RANDOMIZE) { max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); max <<= PAGE_SHIFT; } -- cgit v1.1 From 187e91fe5e915f4b7f39b824aa422493463e443d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 16 Aug 2017 21:08:08 +0200 Subject: x86/boot/64/clang: Use fixup_pointer() to access 'next_early_pgt' __startup_64() is normally using fixup_pointer() to access globals in a position-independent fashion. However 'next_early_pgt' was accessed directly, which wasn't guaranteed to work. Luckily GCC was generating a R_X86_64_PC32 PC-relative relocation for 'next_early_pgt', but Clang emitted a R_X86_64_32S, which led to accessing invalid memory and rebooting the kernel. Signed-off-by: Alexander Potapenko Acked-by: Kirill A. Shutemov Cc: Dmitry Vyukov Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Michael Davidson Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c88d71508e36 ("x86/boot/64: Rewrite startup_64() in C") Link: http://lkml.kernel.org/r/20170816190808.131748-1-glider@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 46c3c73..9ba7954 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -53,6 +53,7 @@ void __head __startup_64(unsigned long physaddr) pudval_t *pud; pmdval_t *pmd, pmd_entry; int i; + unsigned int *next_pgt_ptr; /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) @@ -91,9 +92,9 @@ void __head __startup_64(unsigned long physaddr) * creates a bunch of nonsense entries but that is fine -- * it avoids problems around wraparound. */ - - pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); - pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); + pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); -- cgit v1.1 From 45bd07ad82622fb7c8dd7504d976b7dd11568965 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 20 Jul 2017 17:00:32 +0530 Subject: x86: Constify attribute_group structures attribute_groups are not supposed to change at runtime and none of the groups is modified. Mark the non-const structs as const. [ tglx: Folded into one big patch ] Signed-off-by: Arvind Yadav Signed-off-by: Thomas Gleixner Cc: tony.luck@intel.com Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1500550238-15655-2-git-send-email-arvind.yadav.cs@gmail.com --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/events/intel/uncore_nhmex.c | 12 ++++----- arch/x86/events/intel/uncore_snb.c | 6 ++--- arch/x86/events/intel/uncore_snbep.c | 42 ++++++++++++++++---------------- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- arch/x86/kernel/cpu/microcode/core.c | 4 +-- arch/x86/kernel/ksysfs.c | 4 +-- 7 files changed, 36 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 44ec523..1c5390f 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -721,7 +721,7 @@ static struct attribute *uncore_pmu_attrs[] = { NULL, }; -static struct attribute_group uncore_pmu_attr_group = { +static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index cda5693..6a5cbe9 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -272,7 +272,7 @@ static struct attribute *nhmex_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_ubox_format_group = { +static const struct attribute_group nhmex_uncore_ubox_format_group = { .name = "format", .attrs = nhmex_uncore_ubox_formats_attr, }; @@ -299,7 +299,7 @@ static struct attribute *nhmex_uncore_cbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_cbox_format_group = { +static const struct attribute_group nhmex_uncore_cbox_format_group = { .name = "format", .attrs = nhmex_uncore_cbox_formats_attr, }; @@ -407,7 +407,7 @@ static struct attribute *nhmex_uncore_bbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_bbox_format_group = { +static const struct attribute_group nhmex_uncore_bbox_format_group = { .name = "format", .attrs = nhmex_uncore_bbox_formats_attr, }; @@ -484,7 +484,7 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_sbox_format_group = { +static const struct attribute_group nhmex_uncore_sbox_format_group = { .name = "format", .attrs = nhmex_uncore_sbox_formats_attr, }; @@ -898,7 +898,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_mbox_format_group = { +static const struct attribute_group nhmex_uncore_mbox_format_group = { .name = "format", .attrs = nhmex_uncore_mbox_formats_attr, }; @@ -1163,7 +1163,7 @@ static struct attribute *nhmex_uncore_rbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_rbox_format_group = { +static const struct attribute_group nhmex_uncore_rbox_format_group = { .name = "format", .attrs = nhmex_uncore_rbox_formats_attr, }; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index a3dcc12..db1127c 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -130,7 +130,7 @@ static struct attribute *snb_uncore_formats_attr[] = { NULL, }; -static struct attribute_group snb_uncore_format_group = { +static const struct attribute_group snb_uncore_format_group = { .name = "format", .attrs = snb_uncore_formats_attr, }; @@ -289,7 +289,7 @@ static struct attribute *snb_uncore_imc_formats_attr[] = { NULL, }; -static struct attribute_group snb_uncore_imc_format_group = { +static const struct attribute_group snb_uncore_imc_format_group = { .name = "format", .attrs = snb_uncore_imc_formats_attr, }; @@ -769,7 +769,7 @@ static struct attribute *nhm_uncore_formats_attr[] = { NULL, }; -static struct attribute_group nhm_uncore_format_group = { +static const struct attribute_group nhm_uncore_format_group = { .name = "format", .attrs = nhm_uncore_formats_attr, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 4f91276..db1fe37 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -602,27 +602,27 @@ static struct uncore_event_desc snbep_uncore_qpi_events[] = { { /* end: all zeroes */ }, }; -static struct attribute_group snbep_uncore_format_group = { +static const struct attribute_group snbep_uncore_format_group = { .name = "format", .attrs = snbep_uncore_formats_attr, }; -static struct attribute_group snbep_uncore_ubox_format_group = { +static const struct attribute_group snbep_uncore_ubox_format_group = { .name = "format", .attrs = snbep_uncore_ubox_formats_attr, }; -static struct attribute_group snbep_uncore_cbox_format_group = { +static const struct attribute_group snbep_uncore_cbox_format_group = { .name = "format", .attrs = snbep_uncore_cbox_formats_attr, }; -static struct attribute_group snbep_uncore_pcu_format_group = { +static const struct attribute_group snbep_uncore_pcu_format_group = { .name = "format", .attrs = snbep_uncore_pcu_formats_attr, }; -static struct attribute_group snbep_uncore_qpi_format_group = { +static const struct attribute_group snbep_uncore_qpi_format_group = { .name = "format", .attrs = snbep_uncore_qpi_formats_attr, }; @@ -1431,27 +1431,27 @@ static struct attribute *ivbep_uncore_qpi_formats_attr[] = { NULL, }; -static struct attribute_group ivbep_uncore_format_group = { +static const struct attribute_group ivbep_uncore_format_group = { .name = "format", .attrs = ivbep_uncore_formats_attr, }; -static struct attribute_group ivbep_uncore_ubox_format_group = { +static const struct attribute_group ivbep_uncore_ubox_format_group = { .name = "format", .attrs = ivbep_uncore_ubox_formats_attr, }; -static struct attribute_group ivbep_uncore_cbox_format_group = { +static const struct attribute_group ivbep_uncore_cbox_format_group = { .name = "format", .attrs = ivbep_uncore_cbox_formats_attr, }; -static struct attribute_group ivbep_uncore_pcu_format_group = { +static const struct attribute_group ivbep_uncore_pcu_format_group = { .name = "format", .attrs = ivbep_uncore_pcu_formats_attr, }; -static struct attribute_group ivbep_uncore_qpi_format_group = { +static const struct attribute_group ivbep_uncore_qpi_format_group = { .name = "format", .attrs = ivbep_uncore_qpi_formats_attr, }; @@ -1887,7 +1887,7 @@ static struct attribute *knl_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_ubox_format_group = { +static const struct attribute_group knl_uncore_ubox_format_group = { .name = "format", .attrs = knl_uncore_ubox_formats_attr, }; @@ -1927,7 +1927,7 @@ static struct attribute *knl_uncore_cha_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_cha_format_group = { +static const struct attribute_group knl_uncore_cha_format_group = { .name = "format", .attrs = knl_uncore_cha_formats_attr, }; @@ -2037,7 +2037,7 @@ static struct attribute *knl_uncore_pcu_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_pcu_format_group = { +static const struct attribute_group knl_uncore_pcu_format_group = { .name = "format", .attrs = knl_uncore_pcu_formats_attr, }; @@ -2187,7 +2187,7 @@ static struct attribute *knl_uncore_irp_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_irp_format_group = { +static const struct attribute_group knl_uncore_irp_format_group = { .name = "format", .attrs = knl_uncore_irp_formats_attr, }; @@ -2385,7 +2385,7 @@ static struct attribute *hswep_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_ubox_format_group = { +static const struct attribute_group hswep_uncore_ubox_format_group = { .name = "format", .attrs = hswep_uncore_ubox_formats_attr, }; @@ -2439,7 +2439,7 @@ static struct attribute *hswep_uncore_cbox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_cbox_format_group = { +static const struct attribute_group hswep_uncore_cbox_format_group = { .name = "format", .attrs = hswep_uncore_cbox_formats_attr, }; @@ -2621,7 +2621,7 @@ static struct attribute *hswep_uncore_sbox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_sbox_format_group = { +static const struct attribute_group hswep_uncore_sbox_format_group = { .name = "format", .attrs = hswep_uncore_sbox_formats_attr, }; @@ -3314,7 +3314,7 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_chabox_format_group = { +static const struct attribute_group skx_uncore_chabox_format_group = { .name = "format", .attrs = skx_uncore_cha_formats_attr, }; @@ -3427,7 +3427,7 @@ static struct attribute *skx_uncore_iio_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_iio_format_group = { +static const struct attribute_group skx_uncore_iio_format_group = { .name = "format", .attrs = skx_uncore_iio_formats_attr, }; @@ -3484,7 +3484,7 @@ static struct attribute *skx_uncore_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_format_group = { +static const struct attribute_group skx_uncore_format_group = { .name = "format", .attrs = skx_uncore_formats_attr, }; @@ -3605,7 +3605,7 @@ static struct attribute *skx_upi_uncore_formats_attr[] = { NULL, }; -static struct attribute_group skx_upi_uncore_format_group = { +static const struct attribute_group skx_upi_uncore_format_group = { .name = "format", .attrs = skx_upi_uncore_formats_attr, }; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d7cc190..f7370ab 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = { NULL }; -static struct attribute_group thermal_attr_group = { +static const struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 9cb98ee..86e8f0b 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -561,7 +561,7 @@ static struct attribute *mc_default_attrs[] = { NULL }; -static struct attribute_group mc_attr_group = { +static const struct attribute_group mc_attr_group = { .attrs = mc_default_attrs, .name = "microcode", }; @@ -707,7 +707,7 @@ static struct attribute *cpu_root_microcode_attrs[] = { NULL }; -static struct attribute_group cpu_root_microcode_group = { +static const struct attribute_group cpu_root_microcode_group = { .name = "microcode", .attrs = cpu_root_microcode_attrs, }; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4afc67f..06e1ff5 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = { NULL, }; -static struct attribute_group boot_params_attr_group = { +static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, .bin_attrs = boot_params_data_attrs, }; @@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = { NULL, }; -static struct attribute_group setup_data_attr_group = { +static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, .bin_attrs = setup_data_data_attrs, }; -- cgit v1.1 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumess a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b..9101bfc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB -- cgit v1.1 From 92e5aae457787d0bc6b255200d2fb116edf69794 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 18 Aug 2017 15:15:51 -0700 Subject: kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog Commit 05a4a9527931 ("kernel/watchdog: split up config options") lost the perf-based hardlockup detector's dependency on PERF_EVENTS, which can result in broken builds with some powerpc configurations. Restore the dependency. Add it in for x86 too, despite x86 always selecting PERF_EVENTS it seems reasonable to make the dependency explicit. Link: http://lkml.kernel.org/r/20170810114452.6673-1-npiggin@gmail.com Fixes: 05a4a9527931 ("kernel/watchdog: split up config options") Signed-off-by: Nicholas Piggin Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b..29a1bf8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -163,7 +163,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API -- cgit v1.1 From c715b72c1ba406f133217b509044c38d8e714a37 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Aug 2017 15:16:31 -0700 Subject: mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes Moving the x86_64 and arm64 PIE base from 0x555555554000 to 0x000100000000 broke AddressSanitizer. This is a partial revert of: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") The AddressSanitizer tool has hard-coded expectations about where executable mappings are loaded. The motivation for changing the PIE base in the above commits was to avoid the Stack-Clash CVEs that allowed executable mappings to get too close to heap and stack. This was mainly a problem on 32-bit, but the 64-bit bases were moved too, in an effort to proactively protect those systems (proofs of concept do exist that show 64-bit collisions, but other recent changes to fix stack accounting and setuid behaviors will minimize the impact). The new 32-bit PIE base is fine for ASan (since it matches the ET_EXEC base), so only the 64-bit PIE base needs to be reverted to let x86 and arm64 ASan binaries run again. Future changes to the 64-bit PIE base on these architectures can be made optional once a more dynamic method for dealing with AddressSanitizer is found. (e.g. always loading PIE into the mmap region for marked binaries.) Link: http://lkml.kernel.org/r/20170807201542.GA21271@beast Fixes: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") Fixes: 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") Signed-off-by: Kees Cook Reported-by: Kostya Serebryany Acked-by: Will Deacon Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/elf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1c18d83..9aeb919 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -247,11 +247,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - 0x100000000UL) + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, -- cgit v1.1