diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-11 11:28:43 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-11 11:28:43 -0800 |
commit | 9ada9fd5dfaf0076eadf42cecc68f7adc1717c58 (patch) | |
tree | 543ef3147e5a63c1a877efbf2f1a9e0125273c85 | |
parent | c45564e91604ca4d03505ba4d567541c7e4f86fe (diff) | |
parent | 3bfe5aae8edd8436d26cddfeab783492d8950821 (diff) | |
download | op-kernel-dev-9ada9fd5dfaf0076eadf42cecc68f7adc1717c58.zip op-kernel-dev-9ada9fd5dfaf0076eadf42cecc68f7adc1717c58.tar.gz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp
Pull EDAC fixes from Borislav Petkov:
- EDAC core error path fix, from Denis Kirjanov.
- Generalization of AMD MCE bank names and some minor error reporting
improvements.
- EDAC core cleanups and simplifications, from Wei Yongjun.
- amd64_edac fixes for sysfs-reported values, from Josh Hunt.
- some heavy amd64_edac error reporting path shaving, leading to
removing a bunch of code.
- amd64_edac error injection method improvements.
- EDAC core cleanups and fixes
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: (24 commits)
EDAC, pci_sysfs: Use for_each_pci_dev to simplify the code
EDAC: Handle error path in edac_mc_sysfs_init() properly
MCE, AMD: Dump error status
MCE, AMD: Report decoded error type first
MCE, AMD: Dump CPU f/m/s triple with the error
MCE, AMD: Remove functional unit references
EDAC: Convert to use simple_open()
EDAC, Calxeda highbank: Convert to use simple_open()
EDAC: Fix mc size reported in sysfs
EDAC: Fix csrow size reported in sysfs
EDAC: Pass mci parent
EDAC: Add memory controller flags
amd64_edac: Fix csrows size and pages computation
amd64_edac: Use DBAM_DIMM macro
amd64_edac: Fix K8 chip select reporting
amd64_edac: Reorganize error reporting path
amd64_edac: Do not check whether error address is valid
amd64_edac: Improve error injection
amd64_edac: Cleanup error injection code
amd64_edac: Small fixlets and cleanups
...
-rw-r--r-- | drivers/edac/Kconfig | 8 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.c | 297 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.h | 59 | ||||
-rw-r--r-- | drivers/edac/amd64_edac_inj.c | 128 | ||||
-rw-r--r-- | drivers/edac/edac_mc.c | 51 | ||||
-rw-r--r-- | drivers/edac/edac_mc_sysfs.c | 39 | ||||
-rw-r--r-- | drivers/edac/edac_module.c | 27 | ||||
-rw-r--r-- | drivers/edac/edac_pci.c | 3 | ||||
-rw-r--r-- | drivers/edac/edac_pci_sysfs.c | 12 | ||||
-rw-r--r-- | drivers/edac/highbank_mc_edac.c | 8 | ||||
-rw-r--r-- | drivers/edac/mce_amd.c | 254 | ||||
-rw-r--r-- | drivers/edac/mce_amd.h | 11 | ||||
-rw-r--r-- | include/linux/edac.h | 3 |
13 files changed, 454 insertions, 446 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 409b92b..bb82d6b 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -42,10 +42,10 @@ config EDAC_LEGACY_SYSFS config EDAC_DEBUG bool "Debugging" help - This turns on debugging information for the entire EDAC - sub-system. You can insert module with "debug_level=x", current - there're four debug levels (x=0,1,2,3 from low to high). - Usually you should select 'N'. + This turns on debugging information for the entire EDAC subsystem. + You do so by inserting edac_module with "edac_debug_level=x." Valid + levels are 0-4 (from low to high) and by default it is set to 2. + Usually you should select 'N' here. config EDAC_DECODE_MCE tristate "Decode MCEs in human-readable form (only on AMD for now)" diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index cc8e7c7..f74a684 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -60,8 +60,8 @@ struct scrubrate { { 0x00, 0UL}, /* scrubbing off */ }; -static int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, - u32 *val, const char *func) +int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, + u32 *val, const char *func) { int err = 0; @@ -423,7 +423,6 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size) { struct amd64_pvt *pvt = mci->pvt_info; - u64 base; /* only revE and later have the DRAM Hole Address Register */ if (boot_cpu_data.x86 == 0xf && pvt->ext_model < K8_REV_E) { @@ -462,10 +461,8 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, * addresses in the hole so that they start at 0x100000000. */ - base = dhar_base(pvt); - - *hole_base = base; - *hole_size = (0x1ull << 32) - base; + *hole_base = dhar_base(pvt); + *hole_size = (1ULL << 32) - *hole_base; if (boot_cpu_data.x86 > 0xf) *hole_offset = f10_dhar_offset(pvt); @@ -513,15 +510,15 @@ static u64 sys_addr_to_dram_addr(struct mem_ctl_info *mci, u64 sys_addr) { struct amd64_pvt *pvt = mci->pvt_info; u64 dram_base, hole_base, hole_offset, hole_size, dram_addr; - int ret = 0; + int ret; dram_base = get_dram_base(pvt, pvt->mc_node_id); ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset, &hole_size); if (!ret) { - if ((sys_addr >= (1ull << 32)) && - (sys_addr < ((1ull << 32) + hole_size))) { + if ((sys_addr >= (1ULL << 32)) && + (sys_addr < ((1ULL << 32) + hole_size))) { /* use DHAR to translate SysAddr to DramAddr */ dram_addr = sys_addr - hole_offset; @@ -712,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci, /* Map the Error address to a PAGE and PAGE OFFSET. */ static inline void error_address_to_page_and_offset(u64 error_address, - u32 *page, u32 *offset) + struct err_info *err) { - *page = (u32) (error_address >> PAGE_SHIFT); - *offset = ((u32) error_address) & ~PAGE_MASK; + err->page = (u32) (error_address >> PAGE_SHIFT); + err->offset = ((u32) error_address) & ~PAGE_MASK; } /* @@ -1026,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range) } static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, - u16 syndrome) + struct err_info *err) { - struct mem_ctl_info *src_mci; struct amd64_pvt *pvt = mci->pvt_info; - int channel, csrow; - u32 page, offset; - error_address_to_page_and_offset(sys_addr, &page, &offset); + error_address_to_page_and_offset(sys_addr, err); /* * Find out which node the error address belongs to. This may be * different from the node that detected the error. */ - src_mci = find_mc_by_sys_addr(mci, sys_addr); - if (!src_mci) { + err->src_mci = find_mc_by_sys_addr(mci, sys_addr); + if (!err->src_mci) { amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n", (unsigned long)sys_addr); - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - -1, -1, -1, - "failed to map error addr to a node", - ""); + err->err_code = ERR_NODE; return; } /* Now map the sys_addr to a CSROW */ - csrow = sys_addr_to_csrow(src_mci, sys_addr); - if (csrow < 0) { - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - -1, -1, -1, - "failed to map error addr to a csrow", - ""); + err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr); + if (err->csrow < 0) { + err->err_code = ERR_CSROW; return; } /* CHIPKILL enabled */ if (pvt->nbcfg & NBCFG_CHIPKILL) { - channel = get_channel_from_ecc_syndrome(mci, syndrome); - if (channel < 0) { + err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome); + if (err->channel < 0) { /* * Syndrome didn't map, so we don't know which of the * 2 DIMMs is in error. So we need to ID 'both' of them * as suspect. */ - amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - " + amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - " "possible error reporting race\n", - syndrome); - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - csrow, -1, -1, - "unknown syndrome - possible error reporting race", - ""); + err->syndrome); + err->err_code = ERR_CHANNEL; return; } } else { @@ -1090,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, * was obtained from email communication with someone at AMD. * (Wish the email was placed in this comment - norsk) */ - channel = ((sys_addr & BIT(3)) != 0); + err->channel = ((sys_addr & BIT(3)) != 0); } - - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1, - page, offset, syndrome, - csrow, channel, -1, - "", ""); } static int ddr2_cs_size(unsigned i, bool dct_width) @@ -1482,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr) /* For a given @dram_range, check if @sys_addr falls within it. */ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range, - u64 sys_addr, int *nid, int *chan_sel) + u64 sys_addr, int *chan_sel) { int cs_found = -EINVAL; u64 chan_addr; @@ -1555,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range, cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel); - if (cs_found >= 0) { - *nid = node_id; + if (cs_found >= 0) *chan_sel = channel; - } + return cs_found; } static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, - int *node, int *chan_sel) + int *chan_sel) { int cs_found = -EINVAL; unsigned range; @@ -1577,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, (get_dram_limit(pvt, range) >= sys_addr)) { cs_found = f1x_match_to_this_node(pvt, range, - sys_addr, node, - chan_sel); + sys_addr, chan_sel); if (cs_found >= 0) break; } @@ -1594,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, * (MCX_ADDR). */ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, - u16 syndrome) + struct err_info *err) { struct amd64_pvt *pvt = mci->pvt_info; - u32 page, offset; - int nid, csrow, chan = 0; - error_address_to_page_and_offset(sys_addr, &page, &offset); + error_address_to_page_and_offset(sys_addr, err); - csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan); - - if (csrow < 0) { - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - -1, -1, -1, - "failed to map error addr to a csrow", - ""); + err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel); + if (err->csrow < 0) { + err->err_code = ERR_CSROW; return; } @@ -1619,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, * this point. */ if (dct_ganging_enabled(pvt)) - chan = get_channel_from_ecc_syndrome(mci, syndrome); - - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - csrow, chan, -1, - "", ""); + err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome); } /* @@ -1633,14 +1596,11 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, */ static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) { - int dimm, size0, size1, factor = 0; + int dimm, size0, size1; u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; u32 dbam = ctrl ? pvt->dbam1 : pvt->dbam0; if (boot_cpu_data.x86 == 0xf) { - if (pvt->dclr0 & WIDTH_128) - factor = 1; - /* K8 families < revF not supported yet */ if (pvt->ext_model < K8_REV_F) return; @@ -1671,8 +1631,8 @@ static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) DBAM_DIMM(dimm, dbam)); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", - dimm * 2, size0 << factor, - dimm * 2 + 1, size1 << factor); + dimm * 2, size0, + dimm * 2 + 1, size1); } } @@ -1893,101 +1853,56 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome) return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz); } -/* - * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR - * ADDRESS and process. - */ -static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m) -{ - struct amd64_pvt *pvt = mci->pvt_info; - u64 sys_addr; - u16 syndrome; - - /* Ensure that the Error Address is VALID */ - if (!(m->status & MCI_STATUS_ADDRV)) { - amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n"); - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - 0, 0, 0, - -1, -1, -1, - "HW has no ERROR_ADDRESS available", - ""); - return; - } - - sys_addr = get_error_address(m); - syndrome = extract_syndrome(m->status); - - amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr); - - pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome); -} - -/* Handle any Un-correctable Errors (UEs) */ -static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) +static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err, + u8 ecc_type) { - struct mem_ctl_info *log_mci, *src_mci = NULL; - int csrow; - u64 sys_addr; - u32 page, offset; - - log_mci = mci; + enum hw_event_mc_err_type err_type; + const char *string; - if (!(m->status & MCI_STATUS_ADDRV)) { - amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n"); - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - 0, 0, 0, - -1, -1, -1, - "HW has no ERROR_ADDRESS available", - ""); + if (ecc_type == 2) + err_type = HW_EVENT_ERR_CORRECTED; + else if (ecc_type == 1) + err_type = HW_EVENT_ERR_UNCORRECTED; + else { + WARN(1, "Something is rotten in the state of Denmark.\n"); return; } - sys_addr = get_error_address(m); - error_address_to_page_and_offset(sys_addr, &page, &offset); - - /* - * Find out which node the error address belongs to. This may be - * different from the node that detected the error. - */ - src_mci = find_mc_by_sys_addr(mci, sys_addr); - if (!src_mci) { - amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n", - (unsigned long)sys_addr); - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - page, offset, 0, - -1, -1, -1, - "ERROR ADDRESS NOT mapped to a MC", - ""); - return; + switch (err->err_code) { + case DECODE_OK: + string = ""; + break; + case ERR_NODE: + string = "Failed to map error addr to a node"; + break; + case ERR_CSROW: + string = "Failed to map error addr to a csrow"; + break; + case ERR_CHANNEL: + string = "unknown syndrome - possible error reporting race"; + break; + default: + string = "WTF error"; + break; } - log_mci = src_mci; - - csrow = sys_addr_to_csrow(log_mci, sys_addr); - if (csrow < 0) { - amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n", - (unsigned long)sys_addr); - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - page, offset, 0, - -1, -1, -1, - "ERROR ADDRESS NOT mapped to CS", - ""); - } else { - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - page, offset, 0, - csrow, -1, -1, - "", ""); - } + edac_mc_handle_error(err_type, mci, 1, + err->page, err->offset, err->syndrome, + err->csrow, err->channel, -1, + string, ""); } static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, struct mce *m) { - u16 ec = EC(m->status); - u8 xec = XEC(m->status, 0x1f); + struct amd64_pvt *pvt = mci->pvt_info; u8 ecc_type = (m->status >> 45) & 0x3; + u8 xec = XEC(m->status, 0x1f); + u16 ec = EC(m->status); + u64 sys_addr; + struct err_info err; - /* Bail early out if this was an 'observed' error */ + /* Bail out early if this was an 'observed' error */ if (PP(ec) == NBSL_PP_OBS) return; @@ -1995,10 +1910,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, if (xec && xec != F10_NBSL_EXT_ERR_ECC) return; + memset(&err, 0, sizeof(err)); + + sys_addr = get_error_address(m); + if (ecc_type == 2) - amd64_handle_ce(mci, m); - else if (ecc_type == 1) - amd64_handle_ue(mci, m); + err.syndrome = extract_syndrome(m->status); + + pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err); + + __log_bus_error(mci, &err, ecc_type); } void amd64_decode_bus_error(int node_id, struct mce *m) @@ -2166,6 +2087,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) u32 cs_mode, nr_pages; u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; + /* * The math on this doesn't look right on the surface because x/2*4 can * be simplified to x*2 but this expression makes use of the fact that @@ -2173,13 +2095,13 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) * number of bits to shift the DBAM register to extract the proper CSROW * field. */ - cs_mode = (dbam >> ((csrow_nr / 2) * 4)) & 0xF; + cs_mode = DBAM_DIMM(csrow_nr / 2, dbam); nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT); - edac_dbg(0, " (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode); - edac_dbg(0, " nr_pages/channel= %u channel-count = %d\n", - nr_pages, pvt->channel_count); + edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n", + csrow_nr, dct, cs_mode); + edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); return nr_pages; } @@ -2190,15 +2112,14 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) */ static int init_csrows(struct mem_ctl_info *mci) { + struct amd64_pvt *pvt = mci->pvt_info; struct csrow_info *csrow; struct dimm_info *dimm; - struct amd64_pvt *pvt = mci->pvt_info; - u64 base, mask; - u32 val; - int i, j, empty = 1; - enum mem_type mtype; enum edac_type edac_mode; + enum mem_type mtype; + int i, j, empty = 1; int nr_pages = 0; + u32 val; amd64_read_pci_cfg(pvt->F3, NBCFG, &val); @@ -2208,29 +2129,35 @@ static int init_csrows(struct mem_ctl_info *mci) pvt->mc_node_id, val, !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE)); + /* + * We iterate over DCT0 here but we look at DCT1 in parallel, if needed. + */ for_each_chip_select(i, 0, pvt) { - csrow = mci->csrows[i]; + bool row_dct0 = !!csrow_enabled(i, 0, pvt); + bool row_dct1 = false; + + if (boot_cpu_data.x86 != 0xf) + row_dct1 = !!csrow_enabled(i, 1, pvt); - if (!csrow_enabled(i, 0, pvt) && !csrow_enabled(i, 1, pvt)) { - edac_dbg(1, "----CSROW %d VALID for MC node %d\n", - i, pvt->mc_node_id); + if (!row_dct0 && !row_dct1) continue; - } + csrow = mci->csrows[i]; empty = 0; - if (csrow_enabled(i, 0, pvt)) + + edac_dbg(1, "MC node: %d, csrow: %d\n", + pvt->mc_node_id, i); + + if (row_dct0) nr_pages = amd64_csrow_nr_pages(pvt, 0, i); - if (csrow_enabled(i, 1, pvt)) - nr_pages += amd64_csrow_nr_pages(pvt, 1, i); - get_cs_base_and_mask(pvt, i, 0, &base, &mask); - /* 8 bytes of resolution */ + /* K8 has only one DCT */ + if (boot_cpu_data.x86 != 0xf && row_dct1) + nr_pages += amd64_csrow_nr_pages(pvt, 1, i); mtype = amd64_determine_memory_type(pvt, i); - edac_dbg(1, " for MC node %d csrow %d:\n", pvt->mc_node_id, i); - edac_dbg(1, " nr_pages: %u\n", - nr_pages * pvt->channel_count); + edac_dbg(1, "Total csrow%d pages: %u\n", i, nr_pages); /* * determine whether CHIPKILL or JUST ECC or NO ECC is operating @@ -2247,6 +2174,7 @@ static int init_csrows(struct mem_ctl_info *mci) dimm->edac_mode = edac_mode; dimm->nr_pages = nr_pages; } + csrow->nr_pages = nr_pages; } return empty; @@ -2591,6 +2519,7 @@ static int amd64_init_one_instance(struct pci_dev *F2) mci->pvt_info = pvt; mci->pdev = &pvt->F2->dev; + mci->csbased = 1; setup_mci_misc_attrs(mci, fam_type); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 8c41396..e864f40 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -219,7 +219,7 @@ #define DBAM1 0x180 /* Extract the DIMM 'type' on the i'th DIMM from the DBAM reg value passed */ -#define DBAM_DIMM(i, reg) ((((reg) >> (4*i))) & 0xF) +#define DBAM_DIMM(i, reg) ((((reg) >> (4*(i)))) & 0xF) #define DBAM_MAX_VALUE 11 @@ -267,18 +267,20 @@ #define online_spare_bad_dramcs(pvt, c) (((pvt)->online_spare >> (4 + 4 * (c))) & 0x7) #define F10_NB_ARRAY_ADDR 0xB8 -#define F10_NB_ARRAY_DRAM_ECC BIT(31) +#define F10_NB_ARRAY_DRAM BIT(31) /* Bits [2:1] are used to select 16-byte section within a 64-byte cacheline */ -#define SET_NB_ARRAY_ADDRESS(section) (((section) & 0x3) << 1) +#define SET_NB_ARRAY_ADDR(section) (((section) & 0x3) << 1) #define F10_NB_ARRAY_DATA 0xBC -#define SET_NB_DRAM_INJECTION_WRITE(word, bits) \ - (BIT(((word) & 0xF) + 20) | \ - BIT(17) | bits) -#define SET_NB_DRAM_INJECTION_READ(word, bits) \ - (BIT(((word) & 0xF) + 20) | \ - BIT(16) | bits) +#define F10_NB_ARR_ECC_WR_REQ BIT(17) +#define SET_NB_DRAM_INJECTION_WRITE(inj) \ + (BIT(((inj.word) & 0xF) + 20) | \ + F10_NB_ARR_ECC_WR_REQ | inj.bit_map) +#define SET_NB_DRAM_INJECTION_READ(inj) \ + (BIT(((inj.word) & 0xF) + 20) | \ + BIT(16) | inj.bit_map) + #define NBCAP 0xE8 #define NBCAP_CHIPKILL BIT(4) @@ -305,9 +307,9 @@ enum amd_families { /* Error injection control structure */ struct error_injection { - u32 section; - u32 word; - u32 bit_map; + u32 section; + u32 word; + u32 bit_map; }; /* low and high part of PCI config space regs */ @@ -374,6 +376,23 @@ struct amd64_pvt { struct error_injection injection; }; +enum err_codes { + DECODE_OK = 0, + ERR_NODE = -1, + ERR_CSROW = -2, + ERR_CHANNEL = -3, +}; + +struct err_info { + int err_code; + struct mem_ctl_info *src_mci; + int csrow; + int channel; + u16 syndrome; + u32 page; + u32 offset; +}; + static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i) { u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8; @@ -447,7 +466,7 @@ static inline void amd64_remove_sysfs_inject_files(struct mem_ctl_info *mci) struct low_ops { int (*early_channel_count) (struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, - u16 syndrome); + struct err_info *); int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, unsigned cs_mode); int (*read_dct_pci_cfg) (struct amd64_pvt *pvt, int offset, u32 *val, const char *func); @@ -459,6 +478,8 @@ struct amd64_family_type { struct low_ops ops; }; +int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, + u32 *val, const char *func); int __amd64_write_pci_cfg_dword(struct pci_dev *pdev, int offset, u32 val, const char *func); @@ -475,3 +496,15 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); #define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +/* Injection helpers */ +static inline void disable_caches(void *dummy) +{ + write_cr0(read_cr0() | X86_CR0_CD); + wbinvd(); +} + +static inline void enable_caches(void *dummy) +{ + write_cr0(read_cr0() & ~X86_CR0_CD); +} diff --git a/drivers/edac/amd64_edac_inj.c b/drivers/edac/amd64_edac_inj.c index 53d972e..8c171fa 100644 --- a/drivers/edac/amd64_edac_inj.c +++ b/drivers/edac/amd64_edac_inj.c @@ -22,20 +22,19 @@ static ssize_t amd64_inject_section_store(struct device *dev, struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - if (value > 3) { - amd64_warn("%s: invalid section 0x%lx\n", __func__, value); - return -EINVAL; - } - - pvt->injection.section = (u32) value; - return count; + if (value > 3) { + amd64_warn("%s: invalid section 0x%lx\n", __func__, value); + return -EINVAL; } - return ret; + + pvt->injection.section = (u32) value; + return count; } static ssize_t amd64_inject_word_show(struct device *dev, @@ -60,20 +59,19 @@ static ssize_t amd64_inject_word_store(struct device *dev, struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - if (value > 8) { - amd64_warn("%s: invalid word 0x%lx\n", __func__, value); - return -EINVAL; - } - - pvt->injection.word = (u32) value; - return count; + if (value > 8) { + amd64_warn("%s: invalid word 0x%lx\n", __func__, value); + return -EINVAL; } - return ret; + + pvt->injection.word = (u32) value; + return count; } static ssize_t amd64_inject_ecc_vector_show(struct device *dev, @@ -97,21 +95,19 @@ static ssize_t amd64_inject_ecc_vector_store(struct device *dev, struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; - int ret = 0; + int ret; ret = strict_strtoul(data, 16, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - if (value & 0xFFFF0000) { - amd64_warn("%s: invalid EccVector: 0x%lx\n", - __func__, value); - return -EINVAL; - } - - pvt->injection.bit_map = (u32) value; - return count; + if (value & 0xFFFF0000) { + amd64_warn("%s: invalid EccVector: 0x%lx\n", __func__, value); + return -EINVAL; } - return ret; + + pvt->injection.bit_map = (u32) value; + return count; } /* @@ -126,28 +122,25 @@ static ssize_t amd64_inject_read_store(struct device *dev, struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; u32 section, word_bits; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - /* Form value to choose 16-byte section of cacheline */ - section = F10_NB_ARRAY_DRAM_ECC | - SET_NB_ARRAY_ADDRESS(pvt->injection.section); - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); + /* Form value to choose 16-byte section of cacheline */ + section = F10_NB_ARRAY_DRAM | SET_NB_ARRAY_ADDR(pvt->injection.section); - word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection.word, - pvt->injection.bit_map); + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); - /* Issue 'word' and 'bit' along with the READ request */ - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); + word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection); - edac_dbg(0, "section=0x%x word_bits=0x%x\n", - section, word_bits); + /* Issue 'word' and 'bit' along with the READ request */ + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); - return count; - } - return ret; + edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits); + + return count; } /* @@ -160,30 +153,43 @@ static ssize_t amd64_inject_write_store(struct device *dev, { struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; + u32 section, word_bits, tmp; unsigned long value; - u32 section, word_bits; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; + + /* Form value to choose 16-byte section of cacheline */ + section = F10_NB_ARRAY_DRAM | SET_NB_ARRAY_ADDR(pvt->injection.section); + + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); - /* Form value to choose 16-byte section of cacheline */ - section = F10_NB_ARRAY_DRAM_ECC | - SET_NB_ARRAY_ADDRESS(pvt->injection.section); - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); + word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection); - word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection.word, - pvt->injection.bit_map); + pr_notice_once("Don't forget to decrease MCE polling interval in\n" + "/sys/bus/machinecheck/devices/machinecheck<CPUNUM>/check_interval\n" + "so that you can get the error report faster.\n"); - /* Issue 'word' and 'bit' along with the READ request */ - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); + on_each_cpu(disable_caches, NULL, 1); - edac_dbg(0, "section=0x%x word_bits=0x%x\n", - section, word_bits); + /* Issue 'word' and 'bit' along with the READ request */ + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); - return count; + retry: + /* wait until injection happens */ + amd64_read_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, &tmp); + if (tmp & F10_NB_ARR_ECC_WR_REQ) { + cpu_relax(); + goto retry; } - return ret; + + on_each_cpu(enable_caches, NULL, 1); + + edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits); + + return count; } /* diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 75c0a1a..281f566 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -974,20 +974,22 @@ static void edac_ce_error(struct mem_ctl_info *mci, long grain) { unsigned long remapped_page; + char *msg_aux = ""; + + if (*msg) + msg_aux = " "; if (edac_mc_get_log_ce()) { if (other_detail && *other_detail) edac_mc_printk(mci, KERN_WARNING, - "%d CE %s on %s (%s %s - %s)\n", - error_count, - msg, label, location, - detail, other_detail); + "%d CE %s%son %s (%s %s - %s)\n", + error_count, msg, msg_aux, label, + location, detail, other_detail); else edac_mc_printk(mci, KERN_WARNING, - "%d CE %s on %s (%s %s)\n", - error_count, - msg, label, location, - detail); + "%d CE %s%son %s (%s %s)\n", + error_count, msg, msg_aux, label, + location, detail); } edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count); @@ -1022,27 +1024,31 @@ static void edac_ue_error(struct mem_ctl_info *mci, const char *other_detail, const bool enable_per_layer_report) { + char *msg_aux = ""; + + if (*msg) + msg_aux = " "; + if (edac_mc_get_log_ue()) { if (other_detail && *other_detail) edac_mc_printk(mci, KERN_WARNING, - "%d UE %s on %s (%s %s - %s)\n", - error_count, - msg, label, location, detail, - other_detail); + "%d UE %s%son %s (%s %s - %s)\n", + error_count, msg, msg_aux, label, + location, detail, other_detail); else edac_mc_printk(mci, KERN_WARNING, - "%d UE %s on %s (%s %s)\n", - error_count, - msg, label, location, detail); + "%d UE %s%son %s (%s %s)\n", + error_count, msg, msg_aux, label, + location, detail); } if (edac_mc_get_panic_on_ue()) { if (other_detail && *other_detail) - panic("UE %s on %s (%s%s - %s)\n", - msg, label, location, detail, other_detail); + panic("UE %s%son %s (%s%s - %s)\n", + msg, msg_aux, label, location, detail, other_detail); else - panic("UE %s on %s (%s%s)\n", - msg, label, location, detail); + panic("UE %s%son %s (%s%s)\n", + msg, msg_aux, label, location, detail); } edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); @@ -1101,10 +1107,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, */ for (i = 0; i < mci->n_layers; i++) { if (pos[i] >= (int)mci->layers[i].size) { - if (type == HW_EVENT_ERR_CORRECTED) - p = "CE"; - else - p = "UE"; edac_mc_printk(mci, KERN_ERR, "INTERNAL ERROR: %s value is out of range (%d >= %d)\n", @@ -1136,6 +1138,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, grain = 0; p = label; *p = '\0'; + for (i = 0; i < mci->tot_dimms; i++) { struct dimm_info *dimm = mci->dimms[i]; @@ -1203,6 +1206,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, /* Fill the RAM location data */ p = location; + for (i = 0; i < mci->n_layers; i++) { if (pos[i] < 0) continue; @@ -1215,7 +1219,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, *(p - 1) = '\0'; /* Report the error via the trace interface */ - grain_bits = fls_long(grain) + 1; trace_mc_event(type, msg, label, error_count, mci->mc_idx, top_layer, mid_layer, low_layer, diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index ed0bc07..de2df92 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -180,6 +180,9 @@ static ssize_t csrow_size_show(struct device *dev, int i; u32 nr_pages = 0; + if (csrow->mci->csbased) + return sprintf(data, "%u\n", PAGES_TO_MiB(csrow->nr_pages)); + for (i = 0; i < csrow->nr_channels; i++) nr_pages += csrow->channels[i]->dimm->nr_pages; return sprintf(data, "%u\n", PAGES_TO_MiB(nr_pages)); @@ -373,6 +376,7 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci, csrow->dev.bus = &mci->bus; device_initialize(&csrow->dev); csrow->dev.parent = &mci->dev; + csrow->mci = mci; dev_set_name(&csrow->dev, "csrow%d", index); dev_set_drvdata(&csrow->dev, csrow); @@ -777,10 +781,14 @@ static ssize_t mci_size_mb_show(struct device *dev, for (csrow_idx = 0; csrow_idx < mci->nr_csrows; csrow_idx++) { struct csrow_info *csrow = mci->csrows[csrow_idx]; - for (j = 0; j < csrow->nr_channels; j++) { - struct dimm_info *dimm = csrow->channels[j]->dimm; + if (csrow->mci->csbased) { + total_pages += csrow->nr_pages; + } else { + for (j = 0; j < csrow->nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j]->dimm; - total_pages += dimm->nr_pages; + total_pages += dimm->nr_pages; + } } } @@ -838,14 +846,8 @@ static ssize_t edac_fake_inject_write(struct file *file, return count; } -static int debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static const struct file_operations debug_fake_inject_fops = { - .open = debugfs_open, + .open = simple_open, .write = edac_fake_inject_write, .llseek = generic_file_llseek, }; @@ -1124,10 +1126,15 @@ int __init edac_mc_sysfs_init(void) edac_subsys = edac_get_sysfs_subsys(); if (edac_subsys == NULL) { edac_dbg(1, "no edac_subsys\n"); - return -EINVAL; + err = -EINVAL; + goto out; } mci_pdev = kzalloc(sizeof(*mci_pdev), GFP_KERNEL); + if (!mci_pdev) { + err = -ENOMEM; + goto out_put_sysfs; + } mci_pdev->bus = edac_subsys; mci_pdev->type = &mc_attr_type; @@ -1136,11 +1143,18 @@ int __init edac_mc_sysfs_init(void) err = device_add(mci_pdev); if (err < 0) - return err; + goto out_dev_free; edac_dbg(0, "device %s created\n", dev_name(mci_pdev)); return 0; + + out_dev_free: + kfree(mci_pdev); + out_put_sysfs: + edac_put_sysfs_subsys(); + out: + return err; } void __exit edac_mc_sysfs_exit(void) @@ -1148,4 +1162,5 @@ void __exit edac_mc_sysfs_exit(void) put_device(mci_pdev); device_del(mci_pdev); edac_put_sysfs_subsys(); + kfree(mci_pdev); } diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 58a28d8..12c951a 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -18,9 +18,29 @@ #define EDAC_VERSION "Ver: 3.0.0" #ifdef CONFIG_EDAC_DEBUG + +static int edac_set_debug_level(const char *buf, struct kernel_param *kp) +{ + unsigned long val; + int ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 4) + return -EINVAL; + + return param_set_int(buf, kp); +} + /* Values of 0 to 4 will generate output */ int edac_debug_level = 2; EXPORT_SYMBOL_GPL(edac_debug_level); + +module_param_call(edac_debug_level, edac_set_debug_level, param_get_int, + &edac_debug_level, 0644); +MODULE_PARM_DESC(edac_debug_level, "EDAC debug level: [0-4], default: 2"); #endif /* scope is to module level only */ @@ -132,10 +152,3 @@ module_exit(edac_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Doug Thompson www.softwarebitmaker.com, et al"); MODULE_DESCRIPTION("Core library routines for EDAC reporting"); - -/* refer to *_sysfs.c files for parameters that are exported via sysfs */ - -#ifdef CONFIG_EDAC_DEBUG -module_param(edac_debug_level, int, 0644); -MODULE_PARM_DESC(edac_debug_level, "Debug level"); -#endif diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index ee87ef9..dd370f9 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -470,7 +470,8 @@ struct edac_pci_ctl_info *edac_pci_create_generic_ctl(struct device *dev, pci->mod_name = mod_name; pci->ctl_name = EDAC_PCI_GENCTL_NAME; - pci->edac_check = edac_pci_generic_check; + if (edac_op_state == EDAC_OPSTATE_POLL) + pci->edac_check = edac_pci_generic_check; pdata->edac_idx = edac_pci_idx++; diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index e164c55..dc6e905 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -645,20 +645,16 @@ typedef void (*pci_parity_check_fn_t) (struct pci_dev *dev); /* * pci_dev parity list iterator - * Scan the PCI device list for one pass, looking for SERRORs - * Master Parity ERRORS or Parity ERRORs on primary or secondary devices + * + * Scan the PCI device list looking for SERRORs, Master Parity ERRORS or + * Parity ERRORs on primary or secondary devices. */ static inline void edac_pci_dev_parity_iterator(pci_parity_check_fn_t fn) { struct pci_dev *dev = NULL; - /* request for kernel access to the next PCI device, if any, - * and while we are looking at it have its reference count - * bumped until we are done with it - */ - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) fn(dev); - } } /* diff --git a/drivers/edac/highbank_mc_edac.c b/drivers/edac/highbank_mc_edac.c index c769f47..7ea4cc2 100644 --- a/drivers/edac/highbank_mc_edac.c +++ b/drivers/edac/highbank_mc_edac.c @@ -113,14 +113,8 @@ static ssize_t highbank_mc_err_inject_write(struct file *file, return count; } -static int debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static const struct file_operations highbank_mc_debug_inject_fops = { - .open = debugfs_open, + .open = simple_open, .write = highbank_mc_err_inject_write, .llseek = generic_file_llseek, }; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index d0c372e..ad63757 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -64,7 +64,7 @@ EXPORT_SYMBOL_GPL(to_msgs); const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); -static const char * const f15h_ic_mce_desc[] = { +static const char * const f15h_mc1_mce_desc[] = { "UC during a demand linefill from L2", "Parity error during data load from IC", "Parity error for IC valid bit", @@ -84,7 +84,7 @@ static const char * const f15h_ic_mce_desc[] = { "fetch address FIFO" }; -static const char * const f15h_cu_mce_desc[] = { +static const char * const f15h_mc2_mce_desc[] = { "Fill ECC error on data fills", /* xec = 0x4 */ "Fill parity error on insn fills", "Prefetcher request FIFO parity error", @@ -101,7 +101,7 @@ static const char * const f15h_cu_mce_desc[] = { "PRB address parity error" }; -static const char * const nb_mce_desc[] = { +static const char * const mc4_mce_desc[] = { "DRAM ECC error detected on the NB", "CRC error detected on HT link", "Link-defined sync error packets detected on HT link", @@ -123,7 +123,7 @@ static const char * const nb_mce_desc[] = { "ECC Error in the Probe Filter directory" }; -static const char * const fr_ex_mce_desc[] = { +static const char * const mc5_mce_desc[] = { "CPU Watchdog timer expire", "Wakeup array dest tag", "AG payload array", @@ -139,7 +139,7 @@ static const char * const fr_ex_mce_desc[] = { "DE error occurred" }; -static bool f12h_dc_mce(u16 ec, u8 xec) +static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -157,26 +157,26 @@ static bool f12h_dc_mce(u16 ec, u8 xec) return ret; } -static bool f10h_dc_mce(u16 ec, u8 xec) +static bool f10h_mc0_mce(u16 ec, u8 xec) { if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { pr_cont("during data scrub.\n"); return true; } - return f12h_dc_mce(ec, xec); + return f12h_mc0_mce(ec, xec); } -static bool k8_dc_mce(u16 ec, u8 xec) +static bool k8_mc0_mce(u16 ec, u8 xec) { if (BUS_ERROR(ec)) { pr_cont("during system linefill.\n"); return true; } - return f10h_dc_mce(ec, xec); + return f10h_mc0_mce(ec, xec); } -static bool f14h_dc_mce(u16 ec, u8 xec) +static bool f14h_mc0_mce(u16 ec, u8 xec) { u8 r4 = R4(ec); bool ret = true; @@ -228,7 +228,7 @@ static bool f14h_dc_mce(u16 ec, u8 xec) return ret; } -static bool f15h_dc_mce(u16 ec, u8 xec) +static bool f15h_mc0_mce(u16 ec, u8 xec) { bool ret = true; @@ -275,12 +275,12 @@ static bool f15h_dc_mce(u16 ec, u8 xec) return ret; } -static void amd_decode_dc_mce(struct mce *m) +static void decode_mc0_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Data Cache Error: "); + pr_emerg(HW_ERR "MC0 Error: "); /* TLB error signatures are the same across families */ if (TLB_ERROR(ec)) { @@ -290,13 +290,13 @@ static void amd_decode_dc_mce(struct mce *m) : (xec ? "multimatch" : "parity"))); return; } - } else if (fam_ops->dc_mce(ec, xec)) + } else if (fam_ops->mc0_mce(ec, xec)) ; else - pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); } -static bool k8_ic_mce(u16 ec, u8 xec) +static bool k8_mc1_mce(u16 ec, u8 xec) { u8 ll = LL(ec); bool ret = true; @@ -330,7 +330,7 @@ static bool k8_ic_mce(u16 ec, u8 xec) return ret; } -static bool f14h_ic_mce(u16 ec, u8 xec) +static bool f14h_mc1_mce(u16 ec, u8 xec) { u8 r4 = R4(ec); bool ret = true; @@ -349,7 +349,7 @@ static bool f14h_ic_mce(u16 ec, u8 xec) return ret; } -static bool f15h_ic_mce(u16 ec, u8 xec) +static bool f15h_mc1_mce(u16 ec, u8 xec) { bool ret = true; @@ -358,19 +358,19 @@ static bool f15h_ic_mce(u16 ec, u8 xec) switch (xec) { case 0x0 ... 0xa: - pr_cont("%s.\n", f15h_ic_mce_desc[xec]); + pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); break; case 0xd: - pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]); + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); break; case 0x10: - pr_cont("%s.\n", f15h_ic_mce_desc[xec-4]); + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); break; case 0x11 ... 0x14: - pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]); + pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); break; default: @@ -379,12 +379,12 @@ static bool f15h_ic_mce(u16 ec, u8 xec) return ret; } -static void amd_decode_ic_mce(struct mce *m) +static void decode_mc1_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Instruction Cache Error: "); + pr_emerg(HW_ERR "MC1 Error: "); if (TLB_ERROR(ec)) pr_cont("%s TLB %s.\n", LL_MSG(ec), @@ -393,18 +393,18 @@ static void amd_decode_ic_mce(struct mce *m) bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); - } else if (fam_ops->ic_mce(ec, xec)) + } else if (fam_ops->mc1_mce(ec, xec)) ; else - pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); } -static void amd_decode_bu_mce(struct mce *m) +static void decode_mc2_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Bus Unit Error"); + pr_emerg(HW_ERR "MC2 Error"); if (xec == 0x1) pr_cont(" in the write data buffers.\n"); @@ -429,24 +429,24 @@ static void amd_decode_bu_mce(struct mce *m) pr_cont(": %s parity/ECC error during data " "access from L2.\n", R4_MSG(ec)); else - goto wrong_bu_mce; + goto wrong_mc2_mce; } else - goto wrong_bu_mce; + goto wrong_mc2_mce; } else - goto wrong_bu_mce; + goto wrong_mc2_mce; return; -wrong_bu_mce: - pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); + wrong_mc2_mce: + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); } -static void amd_decode_cu_mce(struct mce *m) +static void decode_f15_mc2_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Combined Unit Error: "); + pr_emerg(HW_ERR "MC2 Error: "); if (TLB_ERROR(ec)) { if (xec == 0x0) @@ -454,63 +454,63 @@ static void amd_decode_cu_mce(struct mce *m) else if (xec == 0x1) pr_cont("Poison data provided for TLB fill.\n"); else - goto wrong_cu_mce; + goto wrong_f15_mc2_mce; } else if (BUS_ERROR(ec)) { if (xec > 2) - goto wrong_cu_mce; + goto wrong_f15_mc2_mce; pr_cont("Error during attempted NB data read.\n"); } else if (MEM_ERROR(ec)) { switch (xec) { case 0x4 ... 0xc: - pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]); + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); break; case 0x10 ... 0x14: - pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]); + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); break; default: - goto wrong_cu_mce; + goto wrong_f15_mc2_mce; } } return; -wrong_cu_mce: - pr_emerg(HW_ERR "Corrupted CU MCE info?\n"); + wrong_f15_mc2_mce: + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); } -static void amd_decode_ls_mce(struct mce *m) +static void decode_mc3_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); if (boot_cpu_data.x86 >= 0x14) { - pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," + pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," " please report on LKML.\n"); return; } - pr_emerg(HW_ERR "Load Store Error"); + pr_emerg(HW_ERR "MC3 Error"); if (xec == 0x0) { u8 r4 = R4(ec); if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) - goto wrong_ls_mce; + goto wrong_mc3_mce; pr_cont(" during %s.\n", R4_MSG(ec)); } else - goto wrong_ls_mce; + goto wrong_mc3_mce; return; -wrong_ls_mce: - pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); + wrong_mc3_mce: + pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); } -void amd_decode_nb_mce(struct mce *m) +static void decode_mc4_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; int node_id = amd_get_nb_id(m->extcpu); @@ -518,7 +518,7 @@ void amd_decode_nb_mce(struct mce *m) u8 xec = XEC(m->status, 0x1f); u8 offset = 0; - pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id); + pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); switch (xec) { case 0x0 ... 0xe: @@ -527,9 +527,9 @@ void amd_decode_nb_mce(struct mce *m) if (xec == 0x0 || xec == 0x8) { /* no ECCs on F11h */ if (c->x86 == 0x11) - goto wrong_nb_mce; + goto wrong_mc4_mce; - pr_cont("%s.\n", nb_mce_desc[xec]); + pr_cont("%s.\n", mc4_mce_desc[xec]); if (nb_bus_decoder) nb_bus_decoder(node_id, m); @@ -543,14 +543,14 @@ void amd_decode_nb_mce(struct mce *m) else if (BUS_ERROR(ec)) pr_cont("DMA Exclusion Vector Table Walk error.\n"); else - goto wrong_nb_mce; + goto wrong_mc4_mce; return; case 0x19: if (boot_cpu_data.x86 == 0x15) pr_cont("Compute Unit Data Error.\n"); else - goto wrong_nb_mce; + goto wrong_mc4_mce; return; case 0x1c ... 0x1f: @@ -558,46 +558,44 @@ void amd_decode_nb_mce(struct mce *m) break; default: - goto wrong_nb_mce; + goto wrong_mc4_mce; } - pr_cont("%s.\n", nb_mce_desc[xec - offset]); + pr_cont("%s.\n", mc4_mce_desc[xec - offset]); return; -wrong_nb_mce: - pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); + wrong_mc4_mce: + pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); } -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); -static void amd_decode_fr_mce(struct mce *m) +static void decode_mc5_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; u8 xec = XEC(m->status, xec_mask); if (c->x86 == 0xf || c->x86 == 0x11) - goto wrong_fr_mce; + goto wrong_mc5_mce; - pr_emerg(HW_ERR "%s Error: ", - (c->x86 == 0x15 ? "Execution Unit" : "FIROB")); + pr_emerg(HW_ERR "MC5 Error: "); if (xec == 0x0 || xec == 0xc) - pr_cont("%s.\n", fr_ex_mce_desc[xec]); + pr_cont("%s.\n", mc5_mce_desc[xec]); else if (xec < 0xd) - pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]); + pr_cont("%s parity error.\n", mc5_mce_desc[xec]); else - goto wrong_fr_mce; + goto wrong_mc5_mce; return; -wrong_fr_mce: - pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); + wrong_mc5_mce: + pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); } -static void amd_decode_fp_mce(struct mce *m) +static void decode_mc6_mce(struct mce *m) { u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Floating Point Unit Error: "); + pr_emerg(HW_ERR "MC6 Error: "); switch (xec) { case 0x1: @@ -621,7 +619,7 @@ static void amd_decode_fp_mce(struct mce *m) break; default: - goto wrong_fp_mce; + goto wrong_mc6_mce; break; } @@ -629,8 +627,8 @@ static void amd_decode_fp_mce(struct mce *m) return; -wrong_fp_mce: - pr_emerg(HW_ERR "Corrupted FP MCE info?\n"); + wrong_mc6_mce: + pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } static inline void amd_decode_err_code(u16 ec) @@ -669,74 +667,94 @@ static bool amd_filter_mce(struct mce *m) return false; } +static const char *decode_error_status(struct mce *m) +{ + if (m->status & MCI_STATUS_UC) { + if (m->status & MCI_STATUS_PCC) + return "System Fatal error."; + if (m->mcgstatus & MCG_STATUS_RIPV) + return "Uncorrected, software restartable error."; + return "Uncorrected, software containable error."; + } + + if (m->status & MCI_STATUS_DEFERRED) + return "Deferred error."; + + return "Corrected error, no action required."; +} + int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; - struct cpuinfo_x86 *c = &boot_cpu_data; + struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; if (amd_filter_mce(m)) return NOTIFY_STOP; - pr_emerg(HW_ERR "CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s", - m->extcpu, m->bank, - ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), - ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), - ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), - ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), - ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - - if (c->x86 == 0x15) - pr_cont("|%s|%s", - ((m->status & BIT_64(44)) ? "Deferred" : "-"), - ((m->status & BIT_64(43)) ? "Poison" : "-")); - - /* do the two bits[14:13] together */ - ecc = (m->status >> 45) & 0x3; - if (ecc) - pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); - - pr_cont("]: 0x%016llx\n", m->status); - - if (m->status & MCI_STATUS_ADDRV) - pr_emerg(HW_ERR "\tMC%d_ADDR: 0x%016llx\n", m->bank, m->addr); - switch (m->bank) { case 0: - amd_decode_dc_mce(m); + decode_mc0_mce(m); break; case 1: - amd_decode_ic_mce(m); + decode_mc1_mce(m); break; case 2: if (c->x86 == 0x15) - amd_decode_cu_mce(m); + decode_f15_mc2_mce(m); else - amd_decode_bu_mce(m); + decode_mc2_mce(m); break; case 3: - amd_decode_ls_mce(m); + decode_mc3_mce(m); break; case 4: - amd_decode_nb_mce(m); + decode_mc4_mce(m); break; case 5: - amd_decode_fr_mce(m); + decode_mc5_mce(m); break; case 6: - amd_decode_fp_mce(m); + decode_mc6_mce(m); break; default: break; } + pr_emerg(HW_ERR "Error Status: %s\n", decode_error_status(m)); + + pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", + m->extcpu, + c->x86, c->x86_model, c->x86_mask, + m->bank, + ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), + ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), + ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), + ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); + + if (c->x86 == 0x15) + pr_cont("|%s|%s", + ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), + ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + + /* do the two bits[14:13] together */ + ecc = (m->status >> 45) & 0x3; + if (ecc) + pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + + pr_cont("]: 0x%016llx\n", m->status); + + if (m->status & MCI_STATUS_ADDRV) + pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr); + amd_decode_err_code(m->status & 0xffff); return NOTIFY_STOP; @@ -763,35 +781,35 @@ static int __init mce_amd_init(void) switch (c->x86) { case 0xf: - fam_ops->dc_mce = k8_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = k8_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x10: - fam_ops->dc_mce = f10h_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = f10h_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x11: - fam_ops->dc_mce = k8_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = k8_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x12: - fam_ops->dc_mce = f12h_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = f12h_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x14: nb_err_cpumask = 0x3; - fam_ops->dc_mce = f14h_dc_mce; - fam_ops->ic_mce = f14h_ic_mce; + fam_ops->mc0_mce = f14h_mc0_mce; + fam_ops->mc1_mce = f14h_mc1_mce; break; case 0x15: xec_mask = 0x1f; - fam_ops->dc_mce = f15h_dc_mce; - fam_ops->ic_mce = f15h_ic_mce; + fam_ops->mc0_mce = f15h_mc0_mce; + fam_ops->mc1_mce = f15h_mc1_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 8c87a5e..6796799 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -29,10 +29,8 @@ #define R4(x) (((x) >> 4) & 0xf) #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") -/* - * F3x4C bits (MCi_STATUS' high half) - */ -#define NBSH_ERR_CPU_VAL BIT(24) +#define MCI_STATUS_DEFERRED BIT_64(44) +#define MCI_STATUS_POISON BIT_64(43) enum tt_ids { TT_INSTR = 0, @@ -78,14 +76,13 @@ extern const char * const ii_msgs[]; * per-family decoder ops */ struct amd_decoder_ops { - bool (*dc_mce)(u16, u8); - bool (*ic_mce)(u16, u8); + bool (*mc0_mce)(u16, u8); + bool (*mc1_mce)(u16, u8); }; void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)); -void amd_decode_nb_mce(struct mce *); int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); #endif /* _EDAC_MCE_AMD_H */ diff --git a/include/linux/edac.h b/include/linux/edac.h index bab9f84..1b8c02b 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -533,6 +533,7 @@ struct csrow_info { u32 ue_count; /* Uncorrectable Errors for this csrow */ u32 ce_count; /* Correctable Errors for this csrow */ + u32 nr_pages; /* combined pages count of all channels */ struct mem_ctl_info *mci; /* the parent */ @@ -667,6 +668,8 @@ struct mem_ctl_info { u32 fake_inject_ue; u16 fake_inject_count; #endif + __u8 csbased : 1, /* csrow-based memory controller */ + __resv : 7; }; #endif |