From f9431992b6227069bc54800d55531c6f78d276a7 Mon Sep 17 00:00:00 2001
From: Doug Thompson
Date: Mon, 27 Apr 2009 19:46:08 +0200
Subject: amd64_edac: add ECC reporting initializers

Borislav:
- convert to the new {rd|wr}msr_on_cpus interfaces.
- convert pvt->old_mcgctl to a bitmask thus saving some bytes
- fix/cleanup comments
- fix function return value patterns
- add a proper bugfix found by Doug to amd64_check_ecc_enabled where we
  missed checking for the ECC enabled bit in NB CFG.
- cleanup debug calls

[ review: restore the MCGCTL NBE bit with the mask instead of shifting by the
  mask value, name the right register when reading K8_NBCFG fails in
  amd64_check_ecc_enabled, extend comments ]

Reviewed-by: Mauro Carvalho Chehab
Signed-off-by: Doug Thompson
Signed-off-by: Borislav Petkov
---
 drivers/edac/amd64_edac.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/edac/amd64_edac.h |   3 +-
 2 files changed, 239 insertions(+), 1 deletion(-)

(limited to 'drivers/edac')

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5a6e714..3b6c421 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2771,3 +2771,240 @@ static int amd64_init_csrows(struct mem_ctl_info *mci)
 	return empty;
 }
 
+/*
+ * Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we"
+ * enable it.
+ *
+ * The previous CECCEn/UECCEn bits of K8_NBCTL and the per-core NBE bit of
+ * K8_MSR_MCGCTL are saved in pvt so amd64_restore_ecc_error_reporting() can
+ * put them back. The final K8_NBCFG value is cached in
+ * pvt->ctl_error_info.nbcfg.
+ */
+static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
+	int cpu, idx = 0, err = 0;
+	struct msr msrs[cpumask_weight(cpumask)];
+	u32 value;
+	u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
+
+	if (!ecc_enable_override)
+		return;
+
+	memset(msrs, 0, sizeof(msrs));
+
+	amd64_printk(KERN_WARNING,
+		"'ecc_enable_override' parameter is active, "
+		"Enabling AMD ECC hardware now: CAUTION\n");
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+
+	/* save the old CECCEn/UECCEn bits, then turn both on */
+	pvt->old_nbctl = value & mask;
+	pvt->nbctl_mcgctl_saved = 1;
+
+	value |= mask;
+	pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCTL, value);
+
+	rdmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	for_each_cpu(cpu, cpumask) {
+		if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
+			set_bit(idx, &pvt->old_mcgctl);
+
+		msrs[idx].l |= K8_MSR_MCGCTL_NBE;
+		idx++;
+	}
+	wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+	if (err)
+		debugf0("Reading K8_NBCFG failed\n");
+
+	debugf0("NBCFG(1)= 0x%x CHIPKILL= %s ECC_ENABLE= %s\n", value,
+		(value & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
+
+	if (!(value & K8_NBCFG_ECC_ENABLE)) {
+		amd64_printk(KERN_WARNING,
+			"This node reports that DRAM ECC is "
+			"currently Disabled; ENABLING now\n");
+
+		/* Attempt to turn on DRAM ECC Enable */
+		value |= K8_NBCFG_ECC_ENABLE;
+		pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCFG, value);
+
+		err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+		if (err)
+			debugf0("Reading K8_NBCFG failed\n");
+
+		if (!(value & K8_NBCFG_ECC_ENABLE)) {
+			amd64_printk(KERN_WARNING,
+				"Hardware rejects Enabling DRAM ECC checking\n"
+				"Check memory DIMM configuration\n");
+		} else {
+			amd64_printk(KERN_DEBUG,
+				"Hardware accepted DRAM ECC Enable\n");
+		}
+	}
+	debugf0("NBCFG(2)= 0x%x CHIPKILL= %s ECC_ENABLE= %s\n", value,
+		(value & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
+
+	pvt->ctl_error_info.nbcfg = value;
+}
+
+/*
+ * Undo amd64_enable_ecc_error_reporting(): put back the saved CECCEn/UECCEn
+ * bits of K8_NBCTL and the saved per-core NBE bit of K8_MSR_MCGCTL. Does
+ * nothing if pvt->nbctl_mcgctl_saved is not set.
+ */
+static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
+{
+	const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
+	int cpu, idx = 0, err = 0;
+	struct msr msrs[cpumask_weight(cpumask)];
+	u32 value;
+	u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
+
+	if (!pvt->nbctl_mcgctl_saved)
+		return;
+
+	memset(msrs, 0, sizeof(msrs));
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+	value &= ~mask;
+	value |= pvt->old_nbctl;
+
+	/* restore the saved CECCEn/UECCEn bits */
+	pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCTL, value);
+
+	rdmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	for_each_cpu(cpu, cpumask) {
+		/* NBE is a bit mask, not a shift count */
+		if (test_bit(idx, &pvt->old_mcgctl))
+			msrs[idx].l |= K8_MSR_MCGCTL_NBE;
+		else
+			msrs[idx].l &= ~K8_MSR_MCGCTL_NBE;
+		idx++;
+	}
+
+	wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+}
+
+/*
+ * Runs on one cpu: clear the int that @ret points to if the NBE bit is not
+ * set in this cpu's MCG_CTL MSR. The int is never set here, so the caller
+ * must preset it.
+ */
+static void check_mcg_ctl(void *ret)
+{
+	u64 msr_val = 0;
+	u8 nbe;
+
+	rdmsrl(MSR_IA32_MCG_CTL, msr_val);
+	nbe = msr_val & K8_MSR_MCGCTL_NBE;
+
+	debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
+		raw_smp_processor_id(), msr_val,
+		(nbe ? "enabled" : "disabled"));
+
+	if (!nbe)
+		*(int *)ret = 0;
+}
+
+/*
+ * Check MCG_CTL on the cpus in @mask. Returns 1 if no polled cpu reports the
+ * NBE bit clear, 0 otherwise.
+ *
+ * NOTE(review): smp_call_function_many() presumably skips the calling cpu, so
+ * that cpu's MCG_CTL would go unchecked if it is in @mask - confirm.
+ */
+static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask)
+{
+	int ret = 1;
+	preempt_disable();
+	smp_call_function_many(mask, check_mcg_ctl, &ret, 1);
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * EDAC requires that the BIOS have ECC enabled before taking over the
+ * processing of ECC errors. This is because the BIOS can properly initialize
+ * the memory system completely. A command line option allows to force-enable
+ * hardware ECC later in amd64_enable_ecc_error_reporting().
+ *
+ * Returns -ENODEV if DRAM ECC or the NBE bit in MCG_CTL is off and
+ * 'ecc_enable_override' is not set; otherwise the value from
+ * amd64_mcg_ctl_enabled_on_cpus() (1 or 0) is passed through unchanged.
+ *
+ * NOTE(review): the success path returns 1, not 0 - confirm callers do not
+ * treat any non-zero value as an error.
+ */
+static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
+{
+	u32 value;
+	int err = 0, ret = 0;
+	u8 ecc_enabled = 0;
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+	if (err)
+		debugf0("Reading K8_NBCFG failed\n");
+
+	ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE);
+
+	ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id));
+
+	debugf0("K8_NBCFG=0x%x, DRAM ECC is %s\n", value,
+			(value & K8_NBCFG_ECC_ENABLE ? "enabled" : "disabled"));
+
+	if (!ecc_enabled || !ret) {
+		if (!ecc_enabled) {
+			amd64_printk(KERN_WARNING, "This node reports that "
+						   "Memory ECC is currently "
+						   "disabled.\n");
+
+			amd64_printk(KERN_WARNING, "bit 0x%lx in register "
+				"F3x%x of the MISC_CONTROL device (%s) "
+				"should be enabled\n", K8_NBCFG_ECC_ENABLE,
+				K8_NBCFG, pci_name(pvt->misc_f3_ctl));
+		}
+		if (!ret) {
+			amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x "
+					"of node %d should be enabled\n",
+					K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL,
+					pvt->mc_node_id);
+		}
+		if (!ecc_enable_override) {
+			amd64_printk(KERN_WARNING, "WARNING: ECC is NOT "
+				"currently enabled by the BIOS. Module "
+				"will NOT be loaded.\n"
+				" Either Enable ECC in the BIOS, "
+				"or use the 'ecc_enable_override' "
+				"parameter.\n"
+				" Might be a BIOS bug, if BIOS says "
+				"ECC is enabled\n"
+				" Use of the override can cause "
+				"unknown side effects.\n");
+			ret = -ENODEV;
+		}
+	} else {
+		amd64_printk(KERN_INFO,
+			"ECC is enabled by BIOS, Proceeding "
+			"with EDAC module initialization\n");
+
+		/* CLEAR the override, since BIOS controlled it */
+		ecc_enable_override = 0;
+	}
+
+	return ret;
+}
+
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 6f5d5d6..e7aa760 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -70,6 +70,7 @@
 #include
 #include
 #include
+#include
 #include "edac_core.h"
 
 #define amd64_printk(level, fmt, arg...) \
@@ -549,7 +550,7 @@ struct amd64_pvt {
 	/* Save old hw registers' values before we modified them */
 	u32 nbctl_mcgctl_saved; /* When true, following 2 are valid */
 	u32 old_nbctl;
-	u32 *old_mcgctl; /* per core on this node */
+	unsigned long old_mcgctl; /* per core on this node */
 
 	/* MC Type Index value: socket F vs Family 10h */
 	u32 mc_type_index;
-- 
cgit v1.1