From c3d1fb567a634dcdff4c6f6095b2053260988336 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 1 Jul 2013 21:08:47 +0530 Subject: mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC The Corrected Machine Check structure (CMC) in HEST has a flag which can be set by the firmware to indicate to the OS that it prefers to process the corrected error events first. In this scenario, the OS is expected to not monitor for corrected errors (through CMCI/polling). Instead, the firmware notifies the OS on corrected error events through GHES. Linux already has support for GHES. This patch adds support for parsing CMC structure and to disable CMCI/polling if the firmware first flag is set. Further, the list of machine check bank structures at the end of CMC is used to determine which MCA banks function in FF mode, so that we continue to monitor error events on the other banks. Signed-off-by: Naveen N. Rao Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- drivers/acpi/apei/hest.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'drivers/acpi') diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index f5ef5d5..b108b11 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "apei-internal.h" @@ -121,6 +122,40 @@ int apei_hest_parse(apei_hest_func_t func, void *data) } EXPORT_SYMBOL_GPL(apei_hest_parse); +/* + * Check if firmware advertises firmware first mode. We need FF bit to be set + * along with a set of MC banks which work in FF mode. + */ +static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data) +{ + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) + return 0; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks) + return 1; + + pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); + + return 1; +} + struct ghes_arr { struct platform_device **ghes_devs; unsigned int count; @@ -227,6 +262,8 @@ void __init acpi_hest_init(void) goto err; } + apei_hest_parse(hest_parse_cmc, NULL); + if (!ghes_disable) { rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); if (rc) -- cgit v1.1 From 9ad95879cd1b22ed016c804f8d686ff83a41a9d4 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 1 Jul 2013 21:08:54 +0530 Subject: mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Add a boot option to disable firmware first mode for corrected errors. Signed-off-by: Naveen N. Rao Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- drivers/acpi/apei/hest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index b108b11..5020245 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -262,7 +262,8 @@ void __init acpi_hest_init(void) goto err; } - apei_hest_parse(hest_parse_cmc, NULL); + if (!acpi_disable_cmcff) + apei_hest_parse(hest_parse_cmc, NULL); if (!ghes_disable) { rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); -- cgit v1.1 From cf870c70a194443f8fc654ddc9d6cfd02c58003b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 10 Jul 2013 14:57:01 +0530 Subject: mce: acpi/apei: Soft-offline a page on firmware GHES notification If the firmware indicates in GHES error data entry that the error threshold has exceeded for a corrected error event, then we try to soft-offline the page. This could be called in interrupt context, so we queue this up similar to how we handle memory failure scenarios. Signed-off-by: Naveen N. Rao Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- drivers/acpi/apei/ghes.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fcd7d91..a8f362a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -409,6 +409,34 @@ static void ghes_clear_estatus(struct ghes *ghes) ghes->flags &= ~GHES_TO_CLEAR; } +static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev) +{ +#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE + unsigned long pfn; + int sec_sev = ghes_severity(gdata->error_severity); + struct cper_sec_mem_err *mem_err; + mem_err = (struct cper_sec_mem_err *)(gdata + 1); + + if (sec_sev == GHES_SEV_CORRECTED && + (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) && + (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) { + pfn = mem_err->physical_addr >> PAGE_SHIFT; + if (pfn_valid(pfn)) + memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE); + else if (printk_ratelimit()) + pr_warn(FW_WARN GHES_PFX + "Invalid address in generic error data: %#llx\n", + mem_err->physical_addr); + } + if (sev == GHES_SEV_RECOVERABLE && + sec_sev == GHES_SEV_RECOVERABLE && + mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { + pfn = mem_err->physical_addr >> PAGE_SHIFT; + memory_failure_queue(pfn, 0, 0); + } +#endif +} + static void ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) { @@ -428,15 +456,7 @@ static void ghes_do_proc(struct ghes *ghes, apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED, mem_err); #endif -#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE - if (sev == GHES_SEV_RECOVERABLE && - sec_sev == GHES_SEV_RECOVERABLE && - mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { - unsigned long pfn; - pfn = mem_err->physical_addr >> PAGE_SHIFT; - memory_failure_queue(pfn, 0, 0); - } -#endif + ghes_handle_memory_failure(gdata, sev); } #ifdef CONFIG_ACPI_APEI_PCIEAER else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, -- cgit v1.1