From 38356c1fbd8cd0f44a32ede2c97f0eb639d06613 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 22 May 2014 16:40:54 +0200 Subject: x86, MCE: Kill CPU_POST_DEAD In conjunction with cleaning up CPU hotplug, we want to get rid of CPU_POST_DEAD. Kill this instance here and rediscover CMCI banks at the end of CPU_DEAD. Link: http://lkml.kernel.org/r/http://lkml.kernel.org/r/1400750624-19238-1-git-send-email-bp@alien8.de Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index bb92f38..8fecdd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) threshold_cpu_callback(action, cpu); mce_device_remove(cpu); mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!(action & CPU_TASKS_FROZEN)) + cmci_rediscover(); break; case CPU_DOWN_PREPARE: smp_call_function_single(cpu, mce_disable_cpu, &action, 1); @@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; } - if (action == CPU_POST_DEAD) { - /* intentionally ignoring frozen here */ - cmci_rediscover(); - } - return NOTIFY_OK; } -- cgit v1.1 From 27c934158c5be0bebfb2970da521b9d9efc0058b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Jun 2014 23:16:45 +0200 Subject: x86, MCE: Robustify mcheck_init_device BorisO reports that misc_register() fails often on xen. The current code unregisters the CPU hotplug notifier in that case. If then a CPU is offlined and onlined back again, we end up with a second timer running on that CPU, leading to soft lockups and system hangs. So let's leave the hotcpu notifier always registered - even if mce_device_create failed for some cores and never unreg it so that we can deal with the timer handling accordingly. Reported-and-Tested-by: Boris Ostrovsky Link: http://lkml.kernel.org/r/1403274493-1371-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8fecdd3..4fc5797 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2450,6 +2450,12 @@ static __init int mcheck_init_device(void) for_each_online_cpu(i) { err = mce_device_create(i); if (err) { + /* + * Register notifier anyway (and do not unreg it) so + * that we don't leave undeleted timers, see notifier + * callback above. + */ + __register_hotcpu_notifier(&mce_cpu_notifier); cpu_notifier_register_done(); goto err_device_create; } @@ -2470,10 +2476,6 @@ static __init int mcheck_init_device(void) err_register: unregister_syscore_ops(&mce_syscore_ops); - cpu_notifier_register_begin(); - __unregister_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); - err_device_create: /* * We didn't keep track of which devices were created above, but -- cgit v1.1 From 9dae3d0d9e64c3cb8bb172f041d4e66d4b92088a Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:11 +0200 Subject: apei, mce: Factor out APEI architecture specific MCE calls. This commit abstracts MCE calls and provides weak corresponding default implementation for those architectures which do not need arch specific actions. Each platform willing to do additional architectural actions should provides desired function definition. It allows us to avoid wrap code into #ifdef in generic code and prevent new platform from introducing dummy stub function too. Initially, there are two APEI arch-specific calls: - arch_apei_enable_cmcff() - arch_apei_report_mem_error() Both interact with MCE driver for X86 architecture. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/acpi/Makefile | 1 + arch/x86/kernel/acpi/apei.c | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 arch/x86/kernel/acpi/apei.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 163b225..3242e59 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_APEI) += apei.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 0000000..12b13de --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c @@ -0,0 +1,56 @@ +/* + * Arch-specific APEI-related functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +#include + +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) +{ +#ifdef CONFIG_X86_MCE + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) + return 0; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || + !cmc->num_hardware_banks) + return 1; + + pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); +#endif + return 1; +} + +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +{ +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev, mem_err); +#endif +} -- cgit v1.1 From 44a69f6195628f6f940566d133a72987559e102d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:12 +0200 Subject: acpi, apei, ghes: Make NMI error notification to be GHES architecture extension. Currently APEI depends on x86 architecture. It is because of NMI hardware error notification of GHES which is currently supported by x86 only. However, many other APEI features can be still used perfectly by other architectures. This commit adds two symbols: 1. HAVE_ACPI_APEI for those archs which support APEI. 2. HAVE_ACPI_APEI_NMI which is used for NMI code isolation in ghes.c file. NMI related data and functions are grouped so they can be wrapped inside one #ifdef section. Appropriate function stubs are provided for !NMI case. Note there is no functional changes for x86 due to hard selected HAVE_ACPI_APEI and HAVE_ACPI_APEI_NMI symbols. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d24887b..4387344 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -132,6 +132,8 @@ config X86 select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI config INSTRUCTION_DECODER def_bool y -- cgit v1.1 From 594c7255dce7a13cac50cf2470cc56e2c3b0494e Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:13 +0200 Subject: acpi, apei, ghes: Factor out ioremap virtual memory for IRQ and NMI context. GHES currently maps two pages with atomic_ioremap. From now on, NMI is architectural depended so there is no need to allocate an NMI page for platforms without NMI support. To make it possible to not use a second page, swap the existing page order so that the IRQ context page is first, and the optional NMI context page is second. Then, use HAVE_ACPI_APEI_NMI to decide how many pages are to be allocated. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/acpi/apei.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index 12b13de..c280df6 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -15,6 +15,7 @@ #include #include +#include int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) { @@ -54,3 +55,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } + +void arch_apei_flush_tlb_one(unsigned long addr) +{ + __flush_tlb_one(addr); +} -- cgit v1.1