diff options
32 files changed, 1981 insertions, 1468 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 7ef73c9..7be9b72 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -715,25 +715,6 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci, input_addr_to_dram_addr(mci, input_addr)); } -/* - * Find the minimum and maximum InputAddr values that map to the given @csrow. - * Pass back these values in *input_addr_min and *input_addr_max. - */ -static void find_csrow_limits(struct mem_ctl_info *mci, int csrow, - u64 *input_addr_min, u64 *input_addr_max) -{ - struct amd64_pvt *pvt; - u64 base, mask; - - pvt = mci->pvt_info; - BUG_ON((csrow < 0) || (csrow >= pvt->csels[0].b_cnt)); - - get_cs_base_and_mask(pvt, csrow, 0, &base, &mask); - - *input_addr_min = base & ~mask; - *input_addr_max = base | mask; -} - /* Map the Error address to a PAGE and PAGE OFFSET. */ static inline void error_address_to_page_and_offset(u64 error_address, u32 *page, u32 *offset) @@ -1058,6 +1039,37 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, int channel, csrow; u32 page, offset; + error_address_to_page_and_offset(sys_addr, &page, &offset); + + /* + * Find out which node the error address belongs to. This may be + * different from the node that detected the error. + */ + src_mci = find_mc_by_sys_addr(mci, sys_addr); + if (!src_mci) { + amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n", + (unsigned long)sys_addr); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offset, syndrome, + -1, -1, -1, + EDAC_MOD_STR, + "failed to map error addr to a node", + NULL); + return; + } + + /* Now map the sys_addr to a CSROW */ + csrow = sys_addr_to_csrow(src_mci, sys_addr); + if (csrow < 0) { + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offset, syndrome, + -1, -1, -1, + EDAC_MOD_STR, + "failed to map error addr to a csrow", + NULL); + return; + } + /* CHIPKILL enabled */ if (pvt->nbcfg & NBCFG_CHIPKILL) { channel = get_channel_from_ecc_syndrome(mci, syndrome); @@ -1067,9 +1079,15 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, * 2 DIMMs is in error. So we need to ID 'both' of them * as suspect. */ - amd64_mc_warn(mci, "unknown syndrome 0x%04x - possible " - "error reporting race\n", syndrome); - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR); + amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - " + "possible error reporting race\n", + syndrome); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offset, syndrome, + csrow, -1, -1, + EDAC_MOD_STR, + "unknown syndrome - possible error reporting race", + NULL); return; } } else { @@ -1084,28 +1102,10 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, channel = ((sys_addr & BIT(3)) != 0); } - /* - * Find out which node the error address belongs to. This may be - * different from the node that detected the error. - */ - src_mci = find_mc_by_sys_addr(mci, sys_addr); - if (!src_mci) { - amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n", - (unsigned long)sys_addr); - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR); - return; - } - - /* Now map the sys_addr to a CSROW */ - csrow = sys_addr_to_csrow(src_mci, sys_addr); - if (csrow < 0) { - edac_mc_handle_ce_no_info(src_mci, EDAC_MOD_STR); - } else { - error_address_to_page_and_offset(sys_addr, &page, &offset); - - edac_mc_handle_ce(src_mci, page, offset, syndrome, csrow, - channel, EDAC_MOD_STR); - } + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, + page, offset, syndrome, + csrow, channel, -1, + EDAC_MOD_STR, "", NULL); } static int ddr2_cs_size(unsigned i, bool dct_width) @@ -1611,15 +1611,20 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, u32 page, offset; int nid, csrow, chan = 0; + error_address_to_page_and_offset(sys_addr, &page, &offset); + csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan); if (csrow < 0) { - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offset, syndrome, + -1, -1, -1, + EDAC_MOD_STR, + "failed to map error addr to a csrow", + NULL); return; } - error_address_to_page_and_offset(sys_addr, &page, &offset); - /* * We need the syndromes for channel detection only when we're * ganged. Otherwise @chan should already contain the channel at @@ -1628,16 +1633,10 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, if (dct_ganging_enabled(pvt)) chan = get_channel_from_ecc_syndrome(mci, syndrome); - if (chan >= 0) - edac_mc_handle_ce(mci, page, offset, syndrome, csrow, chan, - EDAC_MOD_STR); - else - /* - * Channel unknown, report all channels on this CSROW as failed. - */ - for (chan = 0; chan < mci->csrows[csrow].nr_channels; chan++) - edac_mc_handle_ce(mci, page, offset, syndrome, - csrow, chan, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offset, syndrome, + csrow, chan, -1, + EDAC_MOD_STR, "", NULL); } /* @@ -1918,7 +1917,12 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m) /* Ensure that the Error Address is VALID */ if (!(m->status & MCI_STATUS_ADDRV)) { amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n"); - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 0, 0, 0, + -1, -1, -1, + EDAC_MOD_STR, + "HW has no ERROR_ADDRESS available", + NULL); return; } @@ -1942,11 +1946,17 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) if (!(m->status & MCI_STATUS_ADDRV)) { amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n"); - edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 0, 0, 0, + -1, -1, -1, + EDAC_MOD_STR, + "HW has no ERROR_ADDRESS available", + NULL); return; } sys_addr = get_error_address(m); + error_address_to_page_and_offset(sys_addr, &page, &offset); /* * Find out which node the error address belongs to. This may be @@ -1956,7 +1966,11 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) if (!src_mci) { amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n", (unsigned long)sys_addr); - edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, offset, 0, + -1, -1, -1, + EDAC_MOD_STR, + "ERROR ADDRESS NOT mapped to a MC", NULL); return; } @@ -1966,10 +1980,17 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) if (csrow < 0) { amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n", (unsigned long)sys_addr); - edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, offset, 0, + -1, -1, -1, + EDAC_MOD_STR, + "ERROR ADDRESS NOT mapped to CS", + NULL); } else { - error_address_to_page_and_offset(sys_addr, &page, &offset); - edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, offset, 0, + csrow, -1, -1, + EDAC_MOD_STR, "", NULL); } } @@ -2171,7 +2192,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT); debugf0(" (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode); - debugf0(" nr_pages= %u channel-count = %d\n", + debugf0(" nr_pages/channel= %u channel-count = %d\n", nr_pages, pvt->channel_count); return nr_pages; @@ -2185,9 +2206,12 @@ static int init_csrows(struct mem_ctl_info *mci) { struct csrow_info *csrow; struct amd64_pvt *pvt = mci->pvt_info; - u64 input_addr_min, input_addr_max, sys_addr, base, mask; + u64 base, mask; u32 val; - int i, empty = 1; + int i, j, empty = 1; + enum mem_type mtype; + enum edac_type edac_mode; + int nr_pages = 0; amd64_read_pci_cfg(pvt->F3, NBCFG, &val); @@ -2211,41 +2235,32 @@ static int init_csrows(struct mem_ctl_info *mci) empty = 0; if (csrow_enabled(i, 0, pvt)) - csrow->nr_pages = amd64_csrow_nr_pages(pvt, 0, i); + nr_pages = amd64_csrow_nr_pages(pvt, 0, i); if (csrow_enabled(i, 1, pvt)) - csrow->nr_pages += amd64_csrow_nr_pages(pvt, 1, i); - find_csrow_limits(mci, i, &input_addr_min, &input_addr_max); - sys_addr = input_addr_to_sys_addr(mci, input_addr_min); - csrow->first_page = (u32) (sys_addr >> PAGE_SHIFT); - sys_addr = input_addr_to_sys_addr(mci, input_addr_max); - csrow->last_page = (u32) (sys_addr >> PAGE_SHIFT); + nr_pages += amd64_csrow_nr_pages(pvt, 1, i); get_cs_base_and_mask(pvt, i, 0, &base, &mask); - csrow->page_mask = ~mask; /* 8 bytes of resolution */ - csrow->mtype = amd64_determine_memory_type(pvt, i); + mtype = amd64_determine_memory_type(pvt, i); debugf1(" for MC node %d csrow %d:\n", pvt->mc_node_id, i); - debugf1(" input_addr_min: 0x%lx input_addr_max: 0x%lx\n", - (unsigned long)input_addr_min, - (unsigned long)input_addr_max); - debugf1(" sys_addr: 0x%lx page_mask: 0x%lx\n", - (unsigned long)sys_addr, csrow->page_mask); - debugf1(" nr_pages: %u first_page: 0x%lx " - "last_page: 0x%lx\n", - (unsigned)csrow->nr_pages, - csrow->first_page, csrow->last_page); + debugf1(" nr_pages: %u\n", nr_pages * pvt->channel_count); /* * determine whether CHIPKILL or JUST ECC or NO ECC is operating */ if (pvt->nbcfg & NBCFG_ECC_ENABLE) - csrow->edac_mode = - (pvt->nbcfg & NBCFG_CHIPKILL) ? - EDAC_S4ECD4ED : EDAC_SECDED; + edac_mode = (pvt->nbcfg & NBCFG_CHIPKILL) ? + EDAC_S4ECD4ED : EDAC_SECDED; else - csrow->edac_mode = EDAC_NONE; + edac_mode = EDAC_NONE; + + for (j = 0; j < pvt->channel_count; j++) { + csrow->channels[j].dimm->mtype = mtype; + csrow->channels[j].dimm->edac_mode = edac_mode; + csrow->channels[j].dimm->nr_pages = nr_pages; + } } return empty; @@ -2540,6 +2555,7 @@ static int amd64_init_one_instance(struct pci_dev *F2) struct amd64_pvt *pvt = NULL; struct amd64_family_type *fam_type = NULL; struct mem_ctl_info *mci = NULL; + struct edac_mc_layer layers[2]; int err = 0, ret; u8 nid = get_node_id(F2); @@ -2574,7 +2590,13 @@ static int amd64_init_one_instance(struct pci_dev *F2) goto err_siblings; ret = -ENOMEM; - mci = edac_mc_alloc(0, pvt->csels[0].b_cnt, pvt->channel_count, nid); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = pvt->csels[0].b_cnt; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = pvt->channel_count; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(nid, ARRAY_SIZE(layers), layers, 0); if (!mci) goto err_siblings; diff --git a/drivers/edac/amd76x_edac.c b/drivers/edac/amd76x_edac.c index f8fd3c8..9774d44 100644 --- a/drivers/edac/amd76x_edac.c +++ b/drivers/edac/amd76x_edac.c @@ -29,7 +29,6 @@ edac_mc_chipset_printk(mci, level, "amd76x", fmt, ##arg) #define AMD76X_NR_CSROWS 8 -#define AMD76X_NR_CHANS 1 #define AMD76X_NR_DIMMS 4 /* AMD 76x register addresses - device 0 function 0 - PCI bridge */ @@ -146,8 +145,10 @@ static int amd76x_process_error_info(struct mem_ctl_info *mci, if (handle_errors) { row = (info->ecc_mode_status >> 4) & 0xf; - edac_mc_handle_ue(mci, mci->csrows[row].first_page, 0, - row, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + mci->csrows[row].first_page, 0, 0, + row, 0, -1, + mci->ctl_name, "", NULL); } } @@ -159,8 +160,10 @@ static int amd76x_process_error_info(struct mem_ctl_info *mci, if (handle_errors) { row = info->ecc_mode_status & 0xf; - edac_mc_handle_ce(mci, mci->csrows[row].first_page, 0, - 0, row, 0, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + mci->csrows[row].first_page, 0, 0, + row, 0, -1, + mci->ctl_name, "", NULL); } } @@ -186,11 +189,13 @@ static void amd76x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, enum edac_type edac_mode) { struct csrow_info *csrow; + struct dimm_info *dimm; u32 mba, mba_base, mba_mask, dms; int index; for (index = 0; index < mci->nr_csrows; index++) { csrow = &mci->csrows[index]; + dimm = csrow->channels[0].dimm; /* find the DRAM Chip Select Base address and mask */ pci_read_config_dword(pdev, @@ -203,13 +208,13 @@ static void amd76x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, mba_mask = ((mba & 0xff80) << 16) | 0x7fffffUL; pci_read_config_dword(pdev, AMD76X_DRAM_MODE_STATUS, &dms); csrow->first_page = mba_base >> PAGE_SHIFT; - csrow->nr_pages = (mba_mask + 1) >> PAGE_SHIFT; - csrow->last_page = csrow->first_page + csrow->nr_pages - 1; + dimm->nr_pages = (mba_mask + 1) >> PAGE_SHIFT; + csrow->last_page = csrow->first_page + dimm->nr_pages - 1; csrow->page_mask = mba_mask >> PAGE_SHIFT; - csrow->grain = csrow->nr_pages << PAGE_SHIFT; - csrow->mtype = MEM_RDDR; - csrow->dtype = ((dms >> index) & 0x1) ? DEV_X4 : DEV_UNKNOWN; - csrow->edac_mode = edac_mode; + dimm->grain = dimm->nr_pages << PAGE_SHIFT; + dimm->mtype = MEM_RDDR; + dimm->dtype = ((dms >> index) & 0x1) ? DEV_X4 : DEV_UNKNOWN; + dimm->edac_mode = edac_mode; } } @@ -230,7 +235,8 @@ static int amd76x_probe1(struct pci_dev *pdev, int dev_idx) EDAC_SECDED, EDAC_SECDED }; - struct mem_ctl_info *mci = NULL; + struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; u32 ems; u32 ems_mode; struct amd76x_error_info discard; @@ -238,11 +244,17 @@ static int amd76x_probe1(struct pci_dev *pdev, int dev_idx) debugf0("%s()\n", __func__); pci_read_config_dword(pdev, AMD76X_ECC_MODE_STATUS, &ems); ems_mode = (ems >> 10) & 0x3; - mci = edac_mc_alloc(0, AMD76X_NR_CSROWS, AMD76X_NR_CHANS, 0); - if (mci == NULL) { + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = AMD76X_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = 1; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0); + + if (mci == NULL) return -ENOMEM; - } debugf0("%s(): mci = %p\n", __func__, mci); mci->dev = &pdev->dev; diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c index 9a6a274..69ee6aa 100644 --- a/drivers/edac/cell_edac.c +++ b/drivers/edac/cell_edac.c @@ -48,8 +48,9 @@ static void cell_edac_count_ce(struct mem_ctl_info *mci, int chan, u64 ar) syndrome = (ar & 0x000000001fe00000ul) >> 21; /* TODO: Decoding of the error address */ - edac_mc_handle_ce(mci, csrow->first_page + pfn, offset, - syndrome, 0, chan, ""); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + csrow->first_page + pfn, offset, syndrome, + 0, chan, -1, "", "", NULL); } static void cell_edac_count_ue(struct mem_ctl_info *mci, int chan, u64 ar) @@ -69,7 +70,9 @@ static void cell_edac_count_ue(struct mem_ctl_info *mci, int chan, u64 ar) offset = address & ~PAGE_MASK; /* TODO: Decoding of the error address */ - edac_mc_handle_ue(mci, csrow->first_page + pfn, offset, 0, ""); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + csrow->first_page + pfn, offset, 0, + 0, chan, -1, "", "", NULL); } static void cell_edac_check(struct mem_ctl_info *mci) @@ -124,8 +127,11 @@ static void cell_edac_check(struct mem_ctl_info *mci) static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci) { struct csrow_info *csrow = &mci->csrows[0]; + struct dimm_info *dimm; struct cell_edac_priv *priv = mci->pvt_info; struct device_node *np; + int j; + u32 nr_pages; for (np = NULL; (np = of_find_node_by_name(np, "memory")) != NULL;) { @@ -140,15 +146,20 @@ static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci) if (of_node_to_nid(np) != priv->node) continue; csrow->first_page = r.start >> PAGE_SHIFT; - csrow->nr_pages = resource_size(&r) >> PAGE_SHIFT; - csrow->last_page = csrow->first_page + csrow->nr_pages - 1; - csrow->mtype = MEM_XDR; - csrow->edac_mode = EDAC_SECDED; + nr_pages = resource_size(&r) >> PAGE_SHIFT; + csrow->last_page = csrow->first_page + nr_pages - 1; + + for (j = 0; j < csrow->nr_channels; j++) { + dimm = csrow->channels[j].dimm; + dimm->mtype = MEM_XDR; + dimm->edac_mode = EDAC_SECDED; + dimm->nr_pages = nr_pages / csrow->nr_channels; + } dev_dbg(mci->dev, "Initialized on node %d, chanmask=0x%x," " first_page=0x%lx, nr_pages=0x%x\n", priv->node, priv->chanmask, - csrow->first_page, csrow->nr_pages); + csrow->first_page, nr_pages); break; } } @@ -157,9 +168,10 @@ static int __devinit cell_edac_probe(struct platform_device *pdev) { struct cbe_mic_tm_regs __iomem *regs; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct cell_edac_priv *priv; u64 reg; - int rc, chanmask; + int rc, chanmask, num_chans; regs = cbe_get_cpu_mic_tm_regs(cbe_node_to_cpu(pdev->id)); if (regs == NULL) @@ -184,8 +196,16 @@ static int __devinit cell_edac_probe(struct platform_device *pdev) in_be64(®s->mic_fir)); /* Allocate & init EDAC MC data structure */ - mci = edac_mc_alloc(sizeof(struct cell_edac_priv), 1, - chanmask == 3 ? 2 : 1, pdev->id); + num_chans = chanmask == 3 ? 2 : 1; + + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = 1; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_chans; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(pdev->id, ARRAY_SIZE(layers), layers, + sizeof(struct cell_edac_priv)); if (mci == NULL) return -ENOMEM; priv = mci->pvt_info; diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c index a774c0d..e22030a 100644 --- a/drivers/edac/cpc925_edac.c +++ b/drivers/edac/cpc925_edac.c @@ -329,9 +329,10 @@ static void cpc925_init_csrows(struct mem_ctl_info *mci) { struct cpc925_mc_pdata *pdata = mci->pvt_info; struct csrow_info *csrow; - int index; + struct dimm_info *dimm; + int index, j; u32 mbmr, mbbar, bba; - unsigned long row_size, last_nr_pages = 0; + unsigned long row_size, nr_pages, last_nr_pages = 0; get_total_mem(pdata); @@ -350,36 +351,41 @@ static void cpc925_init_csrows(struct mem_ctl_info *mci) row_size = bba * (1UL << 28); /* 256M */ csrow->first_page = last_nr_pages; - csrow->nr_pages = row_size >> PAGE_SHIFT; - csrow->last_page = csrow->first_page + csrow->nr_pages - 1; + nr_pages = row_size >> PAGE_SHIFT; + csrow->last_page = csrow->first_page + nr_pages - 1; last_nr_pages = csrow->last_page + 1; - csrow->mtype = MEM_RDDR; - csrow->edac_mode = EDAC_SECDED; - - switch (csrow->nr_channels) { - case 1: /* Single channel */ - csrow->grain = 32; /* four-beat burst of 32 bytes */ - break; - case 2: /* Dual channel */ - default: - csrow->grain = 64; /* four-beat burst of 64 bytes */ - break; - } - - switch ((mbmr & MBMR_MODE_MASK) >> MBMR_MODE_SHIFT) { - case 6: /* 0110, no way to differentiate X8 VS X16 */ - case 5: /* 0101 */ - case 8: /* 1000 */ - csrow->dtype = DEV_X16; - break; - case 7: /* 0111 */ - case 9: /* 1001 */ - csrow->dtype = DEV_X8; - break; - default: - csrow->dtype = DEV_UNKNOWN; - break; + for (j = 0; j < csrow->nr_channels; j++) { + dimm = csrow->channels[j].dimm; + + dimm->nr_pages = nr_pages / csrow->nr_channels; + dimm->mtype = MEM_RDDR; + dimm->edac_mode = EDAC_SECDED; + + switch (csrow->nr_channels) { + case 1: /* Single channel */ + dimm->grain = 32; /* four-beat burst of 32 bytes */ + break; + case 2: /* Dual channel */ + default: + dimm->grain = 64; /* four-beat burst of 64 bytes */ + break; + } + + switch ((mbmr & MBMR_MODE_MASK) >> MBMR_MODE_SHIFT) { + case 6: /* 0110, no way to differentiate X8 VS X16 */ + case 5: /* 0101 */ + case 8: /* 1000 */ + dimm->dtype = DEV_X16; + break; + case 7: /* 0111 */ + case 9: /* 1001 */ + dimm->dtype = DEV_X8; + break; + default: + dimm->dtype = DEV_UNKNOWN; + break; + } } } } @@ -549,13 +555,18 @@ static void cpc925_mc_check(struct mem_ctl_info *mci) if (apiexcp & CECC_EXCP_DETECTED) { cpc925_mc_printk(mci, KERN_INFO, "DRAM CECC Fault\n"); channel = cpc925_mc_find_channel(mci, syndrome); - edac_mc_handle_ce(mci, pfn, offset, syndrome, - csrow, channel, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + pfn, offset, syndrome, + csrow, channel, -1, + mci->ctl_name, "", NULL); } if (apiexcp & UECC_EXCP_DETECTED) { cpc925_mc_printk(mci, KERN_INFO, "DRAM UECC Fault\n"); - edac_mc_handle_ue(mci, pfn, offset, csrow, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + pfn, offset, 0, + csrow, -1, -1, + mci->ctl_name, "", NULL); } cpc925_mc_printk(mci, KERN_INFO, "Dump registers:\n"); @@ -927,6 +938,7 @@ static int __devinit cpc925_probe(struct platform_device *pdev) { static int edac_mc_idx; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; void __iomem *vbase; struct cpc925_mc_pdata *pdata; struct resource *r; @@ -962,9 +974,16 @@ static int __devinit cpc925_probe(struct platform_device *pdev) goto err2; } - nr_channels = cpc925_mc_get_channels(vbase); - mci = edac_mc_alloc(sizeof(struct cpc925_mc_pdata), - CPC925_NR_CSROWS, nr_channels + 1, edac_mc_idx); + nr_channels = cpc925_mc_get_channels(vbase) + 1; + + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = CPC925_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = nr_channels; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), layers, + sizeof(struct cpc925_mc_pdata)); if (!mci) { cpc925_printk(KERN_ERR, "No memory for mem_ctl_info\n"); res = -ENOMEM; diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index 4122326..3186512 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c @@ -4,7 +4,11 @@ * This file may be distributed under the terms of the * GNU General Public License. * - * See "enum e752x_chips" below for supported chipsets + * Implement support for the e7520, E7525, e7320 and i3100 memory controllers. + * + * Datasheets: + * http://www.intel.in/content/www/in/en/chipsets/e7525-memory-controller-hub-datasheet.html + * ftp://download.intel.com/design/intarch/datashts/31345803.pdf * * Written by Tom Zimmerman * @@ -13,8 +17,6 @@ * Wang Zhenyu at intel.com * Dave Jiang at mvista.com * - * $Id: edac_e752x.c,v 1.5.2.11 2005/10/05 00:43:44 dsp_llnl Exp $ - * */ #include <linux/module.h> @@ -187,6 +189,25 @@ enum e752x_chips { I3100 = 3 }; +/* + * Those chips Support single-rank and dual-rank memories only. + * + * On e752x chips, the odd rows are present only on dual-rank memories. + * Dividing the rank by two will provide the dimm# + * + * i3100 MC has a different mapping: it supports only 4 ranks. + * + * The mapping is (from 1 to n): + * slot single-ranked double-ranked + * dimm #1 -> rank #4 NA + * dimm #2 -> rank #3 NA + * dimm #3 -> rank #2 Ranks 2 and 3 + * dimm #4 -> rank $1 Ranks 1 and 4 + * + * FIXME: The current mapping for i3100 considers that it supports up to 8 + * ranks/chanel, but datasheet says that the MC supports only 4 ranks. + */ + struct e752x_pvt { struct pci_dev *bridge_ck; struct pci_dev *dev_d0f0; @@ -350,8 +371,10 @@ static void do_process_ce(struct mem_ctl_info *mci, u16 error_one, channel = !(error_one & 1); /* e752x mc reads 34:6 of the DRAM linear address */ - edac_mc_handle_ce(mci, page, offset_in_page(sec1_add << 4), - sec1_syndrome, row, channel, "e752x CE"); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offset_in_page(sec1_add << 4), sec1_syndrome, + row, channel, -1, + "e752x CE", "", NULL); } static inline void process_ce(struct mem_ctl_info *mci, u16 error_one, @@ -385,9 +408,12 @@ static void do_process_ue(struct mem_ctl_info *mci, u16 error_one, edac_mc_find_csrow_by_page(mci, block_page); /* e752x mc reads 34:6 of the DRAM linear address */ - edac_mc_handle_ue(mci, block_page, - offset_in_page(error_2b << 4), - row, "e752x UE from Read"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + block_page, + offset_in_page(error_2b << 4), 0, + row, -1, -1, + "e752x UE from Read", "", NULL); + } if (error_one & 0x0404) { error_2b = scrb_add; @@ -401,9 +427,11 @@ static void do_process_ue(struct mem_ctl_info *mci, u16 error_one, edac_mc_find_csrow_by_page(mci, block_page); /* e752x mc reads 34:6 of the DRAM linear address */ - edac_mc_handle_ue(mci, block_page, - offset_in_page(error_2b << 4), - row, "e752x UE from Scruber"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + block_page, + offset_in_page(error_2b << 4), 0, + row, -1, -1, + "e752x UE from Scruber", "", NULL); } } @@ -426,7 +454,9 @@ static inline void process_ue_no_info_wr(struct mem_ctl_info *mci, return; debugf3("%s()\n", __func__); - edac_mc_handle_ue_no_info(mci, "e752x UE log memory write"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, + "e752x UE log memory write", "", NULL); } static void do_process_ded_retry(struct mem_ctl_info *mci, u16 error, @@ -1044,7 +1074,7 @@ static void e752x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, int drc_drbg; /* DRB granularity 0=64mb, 1=128mb */ int drc_ddim; /* DRAM Data Integrity Mode 0=none, 2=edac */ u8 value; - u32 dra, drc, cumul_size; + u32 dra, drc, cumul_size, i, nr_pages; dra = 0; for (index = 0; index < 4; index++) { @@ -1053,7 +1083,7 @@ static void e752x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, dra |= dra_reg << (index * 8); } pci_read_config_dword(pdev, E752X_DRC, &drc); - drc_chan = dual_channel_active(ddrcsr); + drc_chan = dual_channel_active(ddrcsr) ? 1 : 0; drc_drbg = drc_chan + 1; /* 128 in dual mode, 64 in single */ drc_ddim = (drc >> 20) & 0x3; @@ -1078,26 +1108,33 @@ static void e752x_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, csrow->first_page = last_cumul_size; csrow->last_page = cumul_size - 1; - csrow->nr_pages = cumul_size - last_cumul_size; + nr_pages = cumul_size - last_cumul_size; last_cumul_size = cumul_size; - csrow->grain = 1 << 12; /* 4KiB - resolution of CELOG */ - csrow->mtype = MEM_RDDR; /* only one type supported */ - csrow->dtype = mem_dev ? DEV_X4 : DEV_X8; - - /* - * if single channel or x8 devices then SECDED - * if dual channel and x4 then S4ECD4ED - */ - if (drc_ddim) { - if (drc_chan && mem_dev) { - csrow->edac_mode = EDAC_S4ECD4ED; - mci->edac_cap |= EDAC_FLAG_S4ECD4ED; - } else { - csrow->edac_mode = EDAC_SECDED; - mci->edac_cap |= EDAC_FLAG_SECDED; - } - } else - csrow->edac_mode = EDAC_NONE; + + for (i = 0; i < csrow->nr_channels; i++) { + struct dimm_info *dimm = csrow->channels[i].dimm; + + debugf3("Initializing rank at (%i,%i)\n", index, i); + dimm->nr_pages = nr_pages / csrow->nr_channels; + dimm->grain = 1 << 12; /* 4KiB - resolution of CELOG */ + dimm->mtype = MEM_RDDR; /* only one type supported */ + dimm->dtype = mem_dev ? DEV_X4 : DEV_X8; + + /* + * if single channel or x8 devices then SECDED + * if dual channel and x4 then S4ECD4ED + */ + if (drc_ddim) { + if (drc_chan && mem_dev) { + dimm->edac_mode = EDAC_S4ECD4ED; + mci->edac_cap |= EDAC_FLAG_S4ECD4ED; + } else { + dimm->edac_mode = EDAC_SECDED; + mci->edac_cap |= EDAC_FLAG_SECDED; + } + } else + dimm->edac_mode = EDAC_NONE; + } } } @@ -1226,6 +1263,7 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx) u16 pci_data; u8 stat8; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct e752x_pvt *pvt; u16 ddrcsr; int drc_chan; /* Number of channels 0=1chan,1=2chan */ @@ -1252,11 +1290,15 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx) /* Dual channel = 1, Single channel = 0 */ drc_chan = dual_channel_active(ddrcsr); - mci = edac_mc_alloc(sizeof(*pvt), E752X_NR_CSROWS, drc_chan + 1, 0); - - if (mci == NULL) { + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = E752X_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = drc_chan + 1; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); + if (mci == NULL) return -ENOMEM; - } debugf3("%s(): init mci\n", __func__); mci->mtype_cap = MEM_FLAG_RDDR; diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c index 68dea87..9a9c1a5 100644 --- a/drivers/edac/e7xxx_edac.c +++ b/drivers/edac/e7xxx_edac.c @@ -10,6 +10,9 @@ * Based on work by Dan Hollis <goemon at anime dot net> and others. * http://www.anime.net/~goemon/linux-ecc/ * + * Datasheet: + * http://www.intel.com/content/www/us/en/chipsets/e7501-chipset-memory-controller-hub-datasheet.html + * * Contributors: * Eric Biederman (Linux Networx) * Tom Zimmerman (Linux Networx) @@ -71,7 +74,7 @@ #endif /* PCI_DEVICE_ID_INTEL_7505_1_ERR */ #define E7XXX_NR_CSROWS 8 /* number of csrows */ -#define E7XXX_NR_DIMMS 8 /* FIXME - is this correct? */ +#define E7XXX_NR_DIMMS 8 /* 2 channels, 4 dimms/channel */ /* E7XXX register addresses - device 0 function 0 */ #define E7XXX_DRB 0x60 /* DRAM row boundary register (8b) */ @@ -216,13 +219,15 @@ static void process_ce(struct mem_ctl_info *mci, struct e7xxx_error_info *info) row = edac_mc_find_csrow_by_page(mci, page); /* convert syndrome to channel */ channel = e7xxx_find_channel(syndrome); - edac_mc_handle_ce(mci, page, 0, syndrome, row, channel, "e7xxx CE"); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, page, 0, syndrome, + row, channel, -1, "e7xxx CE", "", NULL); } static void process_ce_no_info(struct mem_ctl_info *mci) { debugf3("%s()\n", __func__); - edac_mc_handle_ce_no_info(mci, "e7xxx CE log register overflow"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, -1, -1, -1, + "e7xxx CE log register overflow", "", NULL); } static void process_ue(struct mem_ctl_info *mci, struct e7xxx_error_info *info) @@ -236,13 +241,17 @@ static void process_ue(struct mem_ctl_info *mci, struct e7xxx_error_info *info) /* FIXME - should use PAGE_SHIFT */ block_page = error_2b >> 6; /* convert to 4k address */ row = edac_mc_find_csrow_by_page(mci, block_page); - edac_mc_handle_ue(mci, block_page, 0, row, "e7xxx UE"); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, block_page, 0, 0, + row, -1, -1, "e7xxx UE", "", NULL); } static void process_ue_no_info(struct mem_ctl_info *mci) { debugf3("%s()\n", __func__); - edac_mc_handle_ue_no_info(mci, "e7xxx UE log register overflow"); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, -1, -1, -1, + "e7xxx UE log register overflow", "", NULL); } static void e7xxx_get_error_info(struct mem_ctl_info *mci, @@ -347,11 +356,12 @@ static void e7xxx_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, int dev_idx, u32 drc) { unsigned long last_cumul_size; - int index; + int index, j; u8 value; - u32 dra, cumul_size; + u32 dra, cumul_size, nr_pages; int drc_chan, drc_drbg, drc_ddim, mem_dev; struct csrow_info *csrow; + struct dimm_info *dimm; pci_read_config_dword(pdev, E7XXX_DRA, &dra); drc_chan = dual_channel_active(drc, dev_idx); @@ -379,26 +389,32 @@ static void e7xxx_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, csrow->first_page = last_cumul_size; csrow->last_page = cumul_size - 1; - csrow->nr_pages = cumul_size - last_cumul_size; + nr_pages = cumul_size - last_cumul_size; last_cumul_size = cumul_size; - csrow->grain = 1 << 12; /* 4KiB - resolution of CELOG */ - csrow->mtype = MEM_RDDR; /* only one type supported */ - csrow->dtype = mem_dev ? DEV_X4 : DEV_X8; - - /* - * if single channel or x8 devices then SECDED - * if dual channel and x4 then S4ECD4ED - */ - if (drc_ddim) { - if (drc_chan && mem_dev) { - csrow->edac_mode = EDAC_S4ECD4ED; - mci->edac_cap |= EDAC_FLAG_S4ECD4ED; - } else { - csrow->edac_mode = EDAC_SECDED; - mci->edac_cap |= EDAC_FLAG_SECDED; - } - } else - csrow->edac_mode = EDAC_NONE; + + for (j = 0; j < drc_chan + 1; j++) { + dimm = csrow->channels[j].dimm; + + dimm->nr_pages = nr_pages / (drc_chan + 1); + dimm->grain = 1 << 12; /* 4KiB - resolution of CELOG */ + dimm->mtype = MEM_RDDR; /* only one type supported */ + dimm->dtype = mem_dev ? DEV_X4 : DEV_X8; + + /* + * if single channel or x8 devices then SECDED + * if dual channel and x4 then S4ECD4ED + */ + if (drc_ddim) { + if (drc_chan && mem_dev) { + dimm->edac_mode = EDAC_S4ECD4ED; + mci->edac_cap |= EDAC_FLAG_S4ECD4ED; + } else { + dimm->edac_mode = EDAC_SECDED; + mci->edac_cap |= EDAC_FLAG_SECDED; + } + } else + dimm->edac_mode = EDAC_NONE; + } } } @@ -406,6 +422,7 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx) { u16 pci_data; struct mem_ctl_info *mci = NULL; + struct edac_mc_layer layers[2]; struct e7xxx_pvt *pvt = NULL; u32 drc; int drc_chan; @@ -416,8 +433,21 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx) pci_read_config_dword(pdev, E7XXX_DRC, &drc); drc_chan = dual_channel_active(drc, dev_idx); - mci = edac_mc_alloc(sizeof(*pvt), E7XXX_NR_CSROWS, drc_chan + 1, 0); - + /* + * According with the datasheet, this device has a maximum of + * 4 DIMMS per channel, either single-rank or dual-rank. So, the + * total amount of dimms is 8 (E7XXX_NR_DIMMS). + * That means that the DIMM is mapped as CSROWs, and the channel + * will map the rank. So, an error to either channel should be + * attributed to the same dimm. + */ + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = E7XXX_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = drc_chan + 1; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (mci == NULL) return -ENOMEM; diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index 5b73941..117490d 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -447,8 +447,10 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset, #endif /* CONFIG_PCI */ -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index); +struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, + unsigned n_layers, + struct edac_mc_layer *layers, + unsigned sz_pvt); extern int edac_mc_add_mc(struct mem_ctl_info *mci); extern void edac_mc_free(struct mem_ctl_info *mci); extern struct mem_ctl_info *edac_mc_find(int idx); @@ -456,35 +458,17 @@ extern struct mem_ctl_info *find_mci_by_dev(struct device *dev); extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev); extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page); - -/* - * The no info errors are used when error overflows are reported. - * There are a limited number of error logging registers that can - * be exausted. When all registers are exhausted and an additional - * error occurs then an error overflow register records that an - * error occurred and the type of error, but doesn't have any - * further information. The ce/ue versions make for cleaner - * reporting logic and function interface - reduces conditional - * statement clutter and extra function arguments. - */ -extern void edac_mc_handle_ce(struct mem_ctl_info *mci, - unsigned long page_frame_number, - unsigned long offset_in_page, - unsigned long syndrome, int row, int channel, - const char *msg); -extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, - const char *msg); -extern void edac_mc_handle_ue(struct mem_ctl_info *mci, - unsigned long page_frame_number, - unsigned long offset_in_page, int row, - const char *msg); -extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, - const char *msg); -extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow, - unsigned int channel0, unsigned int channel1, - char *msg); -extern void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, unsigned int csrow, - unsigned int channel, char *msg); +void edac_mc_handle_error(const enum hw_event_mc_err_type type, + struct mem_ctl_info *mci, + const unsigned long page_frame_number, + const unsigned long offset_in_page, + const unsigned long syndrome, + const int layer0, + const int layer1, + const int layer2, + const char *msg, + const char *other_detail, + const void *mcelog); /* * edac_device APIs @@ -496,6 +480,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev, extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev, int inst_nr, int block_nr, const char *msg); extern int edac_device_alloc_index(void); +extern const char *edac_layer_name[]; /* * edac_pci APIs diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 45b8f4b..ee3f1f8 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -79,7 +79,7 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info( unsigned total_size; unsigned count; unsigned instance, block, attr; - void *pvt; + void *pvt, *p; int err; debugf4("%s() instances=%d blocks=%d\n", @@ -92,35 +92,30 @@ struct edac_device_ctl_info *edac_device_alloc_ctl_info( * to be at least as stringent as what the compiler would * provide if we could simply hardcode everything into a single struct. */ - dev_ctl = (struct edac_device_ctl_info *)NULL; + p = NULL; + dev_ctl = edac_align_ptr(&p, sizeof(*dev_ctl), 1); /* Calc the 'end' offset past end of ONE ctl_info structure * which will become the start of the 'instance' array */ - dev_inst = edac_align_ptr(&dev_ctl[1], sizeof(*dev_inst)); + dev_inst = edac_align_ptr(&p, sizeof(*dev_inst), nr_instances); /* Calc the 'end' offset past the instance array within the ctl_info * which will become the start of the block array */ - dev_blk = edac_align_ptr(&dev_inst[nr_instances], sizeof(*dev_blk)); + count = nr_instances * nr_blocks; + dev_blk = edac_align_ptr(&p, sizeof(*dev_blk), count); /* Calc the 'end' offset past the dev_blk array * which will become the start of the attrib array, if any. */ - count = nr_instances * nr_blocks; - dev_attrib = edac_align_ptr(&dev_blk[count], sizeof(*dev_attrib)); - - /* Check for case of when an attribute array is specified */ - if (nr_attrib > 0) { - /* calc how many nr_attrib we need */ + /* calc how many nr_attrib we need */ + if (nr_attrib > 0) count *= nr_attrib; + dev_attrib = edac_align_ptr(&p, sizeof(*dev_attrib), count); - /* Calc the 'end' offset past the attributes array */ - pvt = edac_align_ptr(&dev_attrib[count], sz_private); - } else { - /* no attribute array specified */ - pvt = edac_align_ptr(dev_attrib, sz_private); - } + /* Calc the 'end' offset past the attributes array */ + pvt = edac_align_ptr(&p, sz_private, 1); /* 'pvt' now points to where the private data area is. * At this point 'pvt' (like dev_inst,dev_blk and dev_attrib) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index feef773..10f3750 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -43,9 +43,26 @@ static void edac_mc_dump_channel(struct rank_info *chan) { debugf4("\tchannel = %p\n", chan); debugf4("\tchannel->chan_idx = %d\n", chan->chan_idx); - debugf4("\tchannel->ce_count = %d\n", chan->ce_count); - debugf4("\tchannel->label = '%s'\n", chan->label); debugf4("\tchannel->csrow = %p\n\n", chan->csrow); + debugf4("\tchannel->dimm = %p\n", chan->dimm); +} + +static void edac_mc_dump_dimm(struct dimm_info *dimm) +{ + int i; + + debugf4("\tdimm = %p\n", dimm); + debugf4("\tdimm->label = '%s'\n", dimm->label); + debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages); + debugf4("\tdimm location "); + for (i = 0; i < dimm->mci->n_layers; i++) { + printk(KERN_CONT "%d", dimm->location[i]); + if (i < dimm->mci->n_layers - 1) + printk(KERN_CONT "."); + } + printk(KERN_CONT "\n"); + debugf4("\tdimm->grain = %d\n", dimm->grain); + debugf4("\tdimm->nr_pages = 0x%x\n", dimm->nr_pages); } static void edac_mc_dump_csrow(struct csrow_info *csrow) @@ -55,7 +72,6 @@ static void edac_mc_dump_csrow(struct csrow_info *csrow) debugf4("\tcsrow->first_page = 0x%lx\n", csrow->first_page); debugf4("\tcsrow->last_page = 0x%lx\n", csrow->last_page); debugf4("\tcsrow->page_mask = 0x%lx\n", csrow->page_mask); - debugf4("\tcsrow->nr_pages = 0x%x\n", csrow->nr_pages); debugf4("\tcsrow->nr_channels = %d\n", csrow->nr_channels); debugf4("\tcsrow->channels = %p\n", csrow->channels); debugf4("\tcsrow->mci = %p\n\n", csrow->mci); @@ -70,6 +86,8 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci) debugf4("\tmci->edac_check = %p\n", mci->edac_check); debugf3("\tmci->nr_csrows = %d, csrows = %p\n", mci->nr_csrows, mci->csrows); + debugf3("\tmci->nr_dimms = %d, dimms = %p\n", + mci->tot_dimms, mci->dimms); debugf3("\tdev = %p\n", mci->dev); debugf3("\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name); debugf3("\tpvt_info = %p\n\n", mci->pvt_info); @@ -101,18 +119,37 @@ const char *edac_mem_types[] = { }; EXPORT_SYMBOL_GPL(edac_mem_types); -/* 'ptr' points to a possibly unaligned item X such that sizeof(X) is 'size'. - * Adjust 'ptr' so that its alignment is at least as stringent as what the - * compiler would provide for X and return the aligned result. +/** + * edac_align_ptr - Prepares the pointer offsets for a single-shot allocation + * @p: pointer to a pointer with the memory offset to be used. At + * return, this will be incremented to point to the next offset + * @size: Size of the data structure to be reserved + * @n_elems: Number of elements that should be reserved * * If 'size' is a constant, the compiler will optimize this whole function - * down to either a no-op or the addition of a constant to the value of 'ptr'. + * down to either a no-op or the addition of a constant to the value of '*p'. + * + * The 'p' pointer is absolutely needed to keep the proper advancing + * further in memory to the proper offsets when allocating the struct along + * with its embedded structs, as edac_device_alloc_ctl_info() does it + * above, for example. + * + * At return, the pointer 'p' will be incremented to be used on a next call + * to this function. */ -void *edac_align_ptr(void *ptr, unsigned size) +void *edac_align_ptr(void **p, unsigned size, int n_elems) { unsigned align, r; + void *ptr = *p; + + *p += size * n_elems; - /* Here we assume that the alignment of a "long long" is the most + /* + * 'p' can possibly be an unaligned item X such that sizeof(X) is + * 'size'. Adjust 'p' so that its alignment is at least as + * stringent as what the compiler would provide for X and return + * the aligned result. + * Here we assume that the alignment of a "long long" is the most * stringent alignment that the compiler will ever provide by default. * As far as I know, this is a reasonable assumption. */ @@ -132,14 +169,18 @@ void *edac_align_ptr(void *ptr, unsigned size) if (r == 0) return (char *)ptr; + *p += align - r; + return (void *)(((unsigned long)ptr) + align - r); } /** - * edac_mc_alloc: Allocate a struct mem_ctl_info structure - * @size_pvt: size of private storage needed - * @nr_csrows: Number of CWROWS needed for this MC - * @nr_chans: Number of channels for the MC + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure + * @mc_num: Memory controller number + * @n_layers: Number of MC hierarchy layers + * layers: Describes each layer as seen by the Memory Controller + * @size_pvt: size of private storage needed + * * * Everything is kmalloc'ed as one big chunk - more efficient. * Only can be used if all structures have the same lifetime - otherwise @@ -147,32 +188,77 @@ void *edac_align_ptr(void *ptr, unsigned size) * * Use edac_mc_free() to free mc structures allocated by this function. * + * NOTE: drivers handle multi-rank memories in different ways: in some + * drivers, one multi-rank memory stick is mapped as one entry, while, in + * others, a single multi-rank memory stick would be mapped into several + * entries. Currently, this function will allocate multiple struct dimm_info + * on such scenarios, as grouping the multiple ranks require drivers change. + * * Returns: - * NULL allocation failed - * struct mem_ctl_info pointer + * On failure: NULL + * On success: struct mem_ctl_info pointer */ -struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index) +struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, + unsigned n_layers, + struct edac_mc_layer *layers, + unsigned sz_pvt) { struct mem_ctl_info *mci; - struct csrow_info *csi, *csrow; + struct edac_mc_layer *layer; + struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; - void *pvt; - unsigned size; - int row, chn; - int err; + struct dimm_info *dimm; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; + unsigned pos[EDAC_MAX_LAYERS]; + unsigned size, tot_dimms = 1, count = 1; + unsigned tot_csrows = 1, tot_channels = 1, tot_errcount = 0; + void *pvt, *p, *ptr = NULL; + int i, j, err, row, chn, n, len; + bool per_rank = false; + + BUG_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0); + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + for (i = 0; i < n_layers; i++) { + tot_dimms *= layers[i].size; + if (layers[i].is_virt_csrow) + tot_csrows *= layers[i].size; + else + tot_channels *= layers[i].size; + + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; + } /* Figure out the offsets of the various items from the start of an mc * structure. We want the alignment of each item to be at least as * stringent as what the compiler would provide if we could simply * hardcode everything into a single struct. */ - mci = (struct mem_ctl_info *)0; - csi = edac_align_ptr(&mci[1], sizeof(*csi)); - chi = edac_align_ptr(&csi[nr_csrows], sizeof(*chi)); - pvt = edac_align_ptr(&chi[nr_chans * nr_csrows], sz_pvt); + mci = edac_align_ptr(&ptr, sizeof(*mci), 1); + layer = edac_align_ptr(&ptr, sizeof(*layer), n_layers); + csi = edac_align_ptr(&ptr, sizeof(*csi), tot_csrows); + chi = edac_align_ptr(&ptr, sizeof(*chi), tot_csrows * tot_channels); + dimm = edac_align_ptr(&ptr, sizeof(*dimm), tot_dimms); + for (i = 0; i < n_layers; i++) { + count *= layers[i].size; + debugf4("%s: errcount layer %d size %d\n", __func__, i, count); + ce_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); + ue_per_layer[i] = edac_align_ptr(&ptr, sizeof(u32), count); + tot_errcount += 2 * count; + } + + debugf4("%s: allocating %d error counters\n", __func__, tot_errcount); + pvt = edac_align_ptr(&ptr, sz_pvt, 1); size = ((unsigned long)pvt) + sz_pvt; + debugf1("%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n", + __func__, size, + tot_dimms, + per_rank ? "ranks" : "dimms", + tot_csrows * tot_channels); mci = kzalloc(size, GFP_KERNEL); if (mci == NULL) return NULL; @@ -180,28 +266,103 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, /* Adjust pointers so they point within the memory we just allocated * rather than an imaginary chunk of memory located at address 0. */ + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi)); chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi)); + dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm)); + for (i = 0; i < n_layers; i++) { + mci->ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i])); + mci->ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i])); + } pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; /* setup index and various internal pointers */ - mci->mc_idx = edac_index; + mci->mc_idx = mc_num; mci->csrows = csi; + mci->dimms = dimm; + mci->tot_dimms = tot_dimms; mci->pvt_info = pvt; - mci->nr_csrows = nr_csrows; - - for (row = 0; row < nr_csrows; row++) { - csrow = &csi[row]; - csrow->csrow_idx = row; - csrow->mci = mci; - csrow->nr_channels = nr_chans; - chp = &chi[row * nr_chans]; - csrow->channels = chp; + mci->n_layers = n_layers; + mci->layers = layer; + memcpy(mci->layers, layers, sizeof(*layer) * n_layers); + mci->nr_csrows = tot_csrows; + mci->num_cschannel = tot_channels; + mci->mem_is_per_rank = per_rank; - for (chn = 0; chn < nr_chans; chn++) { + /* + * Fill the csrow struct + */ + for (row = 0; row < tot_csrows; row++) { + csr = &csi[row]; + csr->csrow_idx = row; + csr->mci = mci; + csr->nr_channels = tot_channels; + chp = &chi[row * tot_channels]; + csr->channels = chp; + + for (chn = 0; chn < tot_channels; chn++) { chan = &chp[chn]; chan->chan_idx = chn; - chan->csrow = csrow; + chan->csrow = csr; + } + } + + /* + * Fill the dimm struct + */ + memset(&pos, 0, sizeof(pos)); + row = 0; + chn = 0; + debugf4("%s: initializing %d %s\n", __func__, tot_dimms, + per_rank ? "ranks" : "dimms"); + for (i = 0; i < tot_dimms; i++) { + chan = &csi[row].channels[chn]; + dimm = EDAC_DIMM_PTR(layer, mci->dimms, n_layers, + pos[0], pos[1], pos[2]); + dimm->mci = mci; + + debugf2("%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n", __func__, + i, per_rank ? "rank" : "dimm", (dimm - mci->dimms), + pos[0], pos[1], pos[2], row, chn); + + /* + * Copy DIMM location and initialize it. + */ + len = sizeof(dimm->label); + p = dimm->label; + n = snprintf(p, len, "mc#%u", mc_num); + p += n; + len -= n; + for (j = 0; j < n_layers; j++) { + n = snprintf(p, len, "%s#%u", + edac_layer_name[layers[j].type], + pos[j]); + p += n; + len -= n; + dimm->location[j] = pos[j]; + + if (len <= 0) + break; + } + + /* Link it to the csrows old API data */ + chan->dimm = dimm; + dimm->csrow = row; + dimm->cschannel = chn; + + /* Increment csrow location */ + row++; + if (row == tot_csrows) { + row = 0; + chn++; + } + + /* Increment dimm location */ + for (j = n_layers - 1; j >= 0; j--) { + pos[j]++; + if (pos[j] < layers[j].size) + break; + pos[j] = 0; } } @@ -490,7 +651,6 @@ EXPORT_SYMBOL(edac_mc_find); * edac_mc_add_mc: Insert the 'mci' structure into the mci global list and * create sysfs entries associated with mci structure * @mci: pointer to the mci structure to be added to the list - * @mc_idx: A unique numeric identifier to be assigned to the 'mci' structure. * * Return: * 0 Success @@ -517,6 +677,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci) edac_mc_dump_channel(&mci->csrows[i]. channels[j]); } + for (i = 0; i < mci->tot_dimms; i++) + edac_mc_dump_dimm(&mci->dimms[i]); } #endif mutex_lock(&mem_ctls_mutex); @@ -636,15 +798,19 @@ static void edac_mc_scrub_block(unsigned long page, unsigned long offset, int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page) { struct csrow_info *csrows = mci->csrows; - int row, i; + int row, i, j, n; debugf1("MC%d: %s(): 0x%lx\n", mci->mc_idx, __func__, page); row = -1; for (i = 0; i < mci->nr_csrows; i++) { struct csrow_info *csrow = &csrows[i]; - - if (csrow->nr_pages == 0) + n = 0; + for (j = 0; j < csrow->nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j].dimm; + n += dimm->nr_pages; + } + if (n == 0) continue; debugf3("MC%d: %s(): first(0x%lx) page(0x%lx) last(0x%lx) " @@ -670,249 +836,307 @@ int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page) } EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page); -/* FIXME - setable log (warning/emerg) levels */ -/* FIXME - integrate with evlog: http://evlog.sourceforge.net/ */ -void edac_mc_handle_ce(struct mem_ctl_info *mci, - unsigned long page_frame_number, - unsigned long offset_in_page, unsigned long syndrome, - int row, int channel, const char *msg) -{ - unsigned long remapped_page; +const char *edac_layer_name[] = { + [EDAC_MC_LAYER_BRANCH] = "branch", + [EDAC_MC_LAYER_CHANNEL] = "channel", + [EDAC_MC_LAYER_SLOT] = "slot", + [EDAC_MC_LAYER_CHIP_SELECT] = "csrow", +}; +EXPORT_SYMBOL_GPL(edac_layer_name); - debugf3("MC%d: %s()\n", mci->mc_idx, __func__); +static void edac_inc_ce_error(struct mem_ctl_info *mci, + bool enable_per_layer_report, + const int pos[EDAC_MAX_LAYERS]) +{ + int i, index = 0; - /* FIXME - maybe make panic on INTERNAL ERROR an option */ - if (row >= mci->nr_csrows || row < 0) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: row out of range " - "(%d >= %d)\n", row, mci->nr_csrows); - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR"); - return; - } + mci->ce_mc++; - if (channel >= mci->csrows[row].nr_channels || channel < 0) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: channel out of range " - "(%d >= %d)\n", channel, - mci->csrows[row].nr_channels); - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR"); + if (!enable_per_layer_report) { + mci->ce_noinfo_count++; return; } - if (edac_mc_get_log_ce()) - /* FIXME - put in DIMM location */ - edac_mc_printk(mci, KERN_WARNING, - "CE page 0x%lx, offset 0x%lx, grain %d, syndrome " - "0x%lx, row %d, channel %d, label \"%s\": %s\n", - page_frame_number, offset_in_page, - mci->csrows[row].grain, syndrome, row, channel, - mci->csrows[row].channels[channel].label, msg); - - mci->ce_count++; - mci->csrows[row].ce_count++; - mci->csrows[row].channels[channel].ce_count++; - - if (mci->scrub_mode & SCRUB_SW_SRC) { - /* - * Some MC's can remap memory so that it is still available - * at a different address when PCI devices map into memory. - * MC's that can't do this lose the memory where PCI devices - * are mapped. This mapping is MC dependent and so we call - * back into the MC driver for it to map the MC page to - * a physical (CPU) page which can then be mapped to a virtual - * page - which can then be scrubbed. - */ - remapped_page = mci->ctl_page_to_phys ? - mci->ctl_page_to_phys(mci, page_frame_number) : - page_frame_number; + for (i = 0; i < mci->n_layers; i++) { + if (pos[i] < 0) + break; + index += pos[i]; + mci->ce_per_layer[i][index]++; - edac_mc_scrub_block(remapped_page, offset_in_page, - mci->csrows[row].grain); + if (i < mci->n_layers - 1) + index *= mci->layers[i + 1].size; } } -EXPORT_SYMBOL_GPL(edac_mc_handle_ce); -void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, const char *msg) +static void edac_inc_ue_error(struct mem_ctl_info *mci, + bool enable_per_layer_report, + const int pos[EDAC_MAX_LAYERS]) { - if (edac_mc_get_log_ce()) - edac_mc_printk(mci, KERN_WARNING, - "CE - no information available: %s\n", msg); + int i, index = 0; - mci->ce_noinfo_count++; - mci->ce_count++; -} -EXPORT_SYMBOL_GPL(edac_mc_handle_ce_no_info); + mci->ue_mc++; -void edac_mc_handle_ue(struct mem_ctl_info *mci, - unsigned long page_frame_number, - unsigned long offset_in_page, int row, const char *msg) -{ - int len = EDAC_MC_LABEL_LEN * 4; - char labels[len + 1]; - char *pos = labels; - int chan; - int chars; - - debugf3("MC%d: %s()\n", mci->mc_idx, __func__); - - /* FIXME - maybe make panic on INTERNAL ERROR an option */ - if (row >= mci->nr_csrows || row < 0) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: row out of range " - "(%d >= %d)\n", row, mci->nr_csrows); - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); + if (!enable_per_layer_report) { + mci->ce_noinfo_count++; return; } - chars = snprintf(pos, len + 1, "%s", - mci->csrows[row].channels[0].label); - len -= chars; - pos += chars; + for (i = 0; i < mci->n_layers; i++) { + if (pos[i] < 0) + break; + index += pos[i]; + mci->ue_per_layer[i][index]++; - for (chan = 1; (chan < mci->csrows[row].nr_channels) && (len > 0); - chan++) { - chars = snprintf(pos, len + 1, ":%s", - mci->csrows[row].channels[chan].label); - len -= chars; - pos += chars; + if (i < mci->n_layers - 1) + index *= mci->layers[i + 1].size; } +} - if (edac_mc_get_log_ue()) - edac_mc_printk(mci, KERN_EMERG, - "UE page 0x%lx, offset 0x%lx, grain %d, row %d, " - "labels \"%s\": %s\n", page_frame_number, - offset_in_page, mci->csrows[row].grain, row, - labels, msg); +static void edac_ce_error(struct mem_ctl_info *mci, + const int pos[EDAC_MAX_LAYERS], + const char *msg, + const char *location, + const char *label, + const char *detail, + const char *other_detail, + const bool enable_per_layer_report, + const unsigned long page_frame_number, + const unsigned long offset_in_page, + u32 grain) +{ + unsigned long remapped_page; - if (edac_mc_get_panic_on_ue()) - panic("EDAC MC%d: UE page 0x%lx, offset 0x%lx, grain %d, " - "row %d, labels \"%s\": %s\n", mci->mc_idx, - page_frame_number, offset_in_page, - mci->csrows[row].grain, row, labels, msg); + if (edac_mc_get_log_ce()) { + if (other_detail && *other_detail) + edac_mc_printk(mci, KERN_WARNING, + "CE %s on %s (%s%s - %s)\n", + msg, label, location, + detail, other_detail); + else + edac_mc_printk(mci, KERN_WARNING, + "CE %s on %s (%s%s)\n", + msg, label, location, + detail); + } + edac_inc_ce_error(mci, enable_per_layer_report, pos); - mci->ue_count++; - mci->csrows[row].ue_count++; + if (mci->scrub_mode & SCRUB_SW_SRC) { + /* + * Some memory controllers (called MCs below) can remap + * memory so that it is still available at a different + * address when PCI devices map into memory. + * MC's that can't do this, lose the memory where PCI + * devices are mapped. This mapping is MC-dependent + * and so we call back into the MC driver for it to + * map the MC page to a physical (CPU) page which can + * then be mapped to a virtual page - which can then + * be scrubbed. + */ + remapped_page = mci->ctl_page_to_phys ? + mci->ctl_page_to_phys(mci, page_frame_number) : + page_frame_number; + + edac_mc_scrub_block(remapped_page, + offset_in_page, grain); + } } -EXPORT_SYMBOL_GPL(edac_mc_handle_ue); -void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, const char *msg) +static void edac_ue_error(struct mem_ctl_info *mci, + const int pos[EDAC_MAX_LAYERS], + const char *msg, + const char *location, + const char *label, + const char *detail, + const char *other_detail, + const bool enable_per_layer_report) { - if (edac_mc_get_panic_on_ue()) - panic("EDAC MC%d: Uncorrected Error", mci->mc_idx); + if (edac_mc_get_log_ue()) { + if (other_detail && *other_detail) + edac_mc_printk(mci, KERN_WARNING, + "UE %s on %s (%s%s - %s)\n", + msg, label, location, detail, + other_detail); + else + edac_mc_printk(mci, KERN_WARNING, + "UE %s on %s (%s%s)\n", + msg, label, location, detail); + } - if (edac_mc_get_log_ue()) - edac_mc_printk(mci, KERN_WARNING, - "UE - no information available: %s\n", msg); - mci->ue_noinfo_count++; - mci->ue_count++; + if (edac_mc_get_panic_on_ue()) { + if (other_detail && *other_detail) + panic("UE %s on %s (%s%s - %s)\n", + msg, label, location, detail, other_detail); + else + panic("UE %s on %s (%s%s)\n", + msg, label, location, detail); + } + + edac_inc_ue_error(mci, enable_per_layer_report, pos); } -EXPORT_SYMBOL_GPL(edac_mc_handle_ue_no_info); -/************************************************************* - * On Fully Buffered DIMM modules, this help function is - * called to process UE events - */ -void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, - unsigned int csrow, - unsigned int channela, - unsigned int channelb, char *msg) +#define OTHER_LABEL " or " +void edac_mc_handle_error(const enum hw_event_mc_err_type type, + struct mem_ctl_info *mci, + const unsigned long page_frame_number, + const unsigned long offset_in_page, + const unsigned long syndrome, + const int layer0, + const int layer1, + const int layer2, + const char *msg, + const char *other_detail, + const void *mcelog) { - int len = EDAC_MC_LABEL_LEN * 4; - char labels[len + 1]; - char *pos = labels; - int chars; + /* FIXME: too much for stack: move it to some pre-alocated area */ + char detail[80], location[80]; + char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms]; + char *p; + int row = -1, chan = -1; + int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 }; + int i; + u32 grain; + bool enable_per_layer_report = false; - if (csrow >= mci->nr_csrows) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: row out of range (%d >= %d)\n", - csrow, mci->nr_csrows); - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); - return; - } + debugf3("MC%d: %s()\n", mci->mc_idx, __func__); - if (channela >= mci->csrows[csrow].nr_channels) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: channel-a out of range " - "(%d >= %d)\n", - channela, mci->csrows[csrow].nr_channels); - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); - return; + /* + * Check if the event report is consistent and if the memory + * location is known. If it is known, enable_per_layer_report will be + * true, the DIMM(s) label info will be filled and the per-layer + * error counters will be incremented. + */ + for (i = 0; i < mci->n_layers; i++) { + if (pos[i] >= (int)mci->layers[i].size) { + if (type == HW_EVENT_ERR_CORRECTED) + p = "CE"; + else + p = "UE"; + + edac_mc_printk(mci, KERN_ERR, + "INTERNAL ERROR: %s value is out of range (%d >= %d)\n", + edac_layer_name[mci->layers[i].type], + pos[i], mci->layers[i].size); + /* + * Instead of just returning it, let's use what's + * known about the error. The increment routines and + * the DIMM filter logic will do the right thing by + * pointing the likely damaged DIMMs. + */ + pos[i] = -1; + } + if (pos[i] >= 0) + enable_per_layer_report = true; } - if (channelb >= mci->csrows[csrow].nr_channels) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: channel-b out of range " - "(%d >= %d)\n", - channelb, mci->csrows[csrow].nr_channels); - edac_mc_handle_ue_no_info(mci, "INTERNAL ERROR"); - return; - } + /* + * Get the dimm label/grain that applies to the match criteria. + * As the error algorithm may not be able to point to just one memory + * stick, the logic here will get all possible labels that could + * pottentially be affected by the error. + * On FB-DIMM memory controllers, for uncorrected errors, it is common + * to have only the MC channel and the MC dimm (also called "branch") + * but the channel is not known, as the memory is arranged in pairs, + * where each memory belongs to a separate channel within the same + * branch. + */ + grain = 0; + p = label; + *p = '\0'; + for (i = 0; i < mci->tot_dimms; i++) { + struct dimm_info *dimm = &mci->dimms[i]; - mci->ue_count++; - mci->csrows[csrow].ue_count++; + if (layer0 >= 0 && layer0 != dimm->location[0]) + continue; + if (layer1 >= 0 && layer1 != dimm->location[1]) + continue; + if (layer2 >= 0 && layer2 != dimm->location[2]) + continue; - /* Generate the DIMM labels from the specified channels */ - chars = snprintf(pos, len + 1, "%s", - mci->csrows[csrow].channels[channela].label); - len -= chars; - pos += chars; - chars = snprintf(pos, len + 1, "-%s", - mci->csrows[csrow].channels[channelb].label); + /* get the max grain, over the error match range */ + if (dimm->grain > grain) + grain = dimm->grain; - if (edac_mc_get_log_ue()) - edac_mc_printk(mci, KERN_EMERG, - "UE row %d, channel-a= %d channel-b= %d " - "labels \"%s\": %s\n", csrow, channela, channelb, - labels, msg); + /* + * If the error is memory-controller wide, there's no need to + * seek for the affected DIMMs because the whole + * channel/memory controller/... may be affected. + * Also, don't show errors for empty DIMM slots. + */ + if (enable_per_layer_report && dimm->nr_pages) { + if (p != label) { + strcpy(p, OTHER_LABEL); + p += strlen(OTHER_LABEL); + } + strcpy(p, dimm->label); + p += strlen(p); + *p = '\0'; + + /* + * get csrow/channel of the DIMM, in order to allow + * incrementing the compat API counters + */ + debugf4("%s: %s csrows map: (%d,%d)\n", + __func__, + mci->mem_is_per_rank ? "rank" : "dimm", + dimm->csrow, dimm->cschannel); + + if (row == -1) + row = dimm->csrow; + else if (row >= 0 && row != dimm->csrow) + row = -2; + + if (chan == -1) + chan = dimm->cschannel; + else if (chan >= 0 && chan != dimm->cschannel) + chan = -2; + } + } - if (edac_mc_get_panic_on_ue()) - panic("UE row %d, channel-a= %d channel-b= %d " - "labels \"%s\": %s\n", csrow, channela, - channelb, labels, msg); -} -EXPORT_SYMBOL(edac_mc_handle_fbd_ue); + if (!enable_per_layer_report) { + strcpy(label, "any memory"); + } else { + debugf4("%s: csrow/channel to increment: (%d,%d)\n", + __func__, row, chan); + if (p == label) + strcpy(label, "unknown memory"); + if (type == HW_EVENT_ERR_CORRECTED) { + if (row >= 0) { + mci->csrows[row].ce_count++; + if (chan >= 0) + mci->csrows[row].channels[chan].ce_count++; + } + } else + if (row >= 0) + mci->csrows[row].ue_count++; + } -/************************************************************* - * On Fully Buffered DIMM modules, this help function is - * called to process CE events - */ -void edac_mc_handle_fbd_ce(struct mem_ctl_info *mci, - unsigned int csrow, unsigned int channel, char *msg) -{ + /* Fill the RAM location data */ + p = location; + for (i = 0; i < mci->n_layers; i++) { + if (pos[i] < 0) + continue; - /* Ensure boundary values */ - if (csrow >= mci->nr_csrows) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: row out of range (%d >= %d)\n", - csrow, mci->nr_csrows); - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR"); - return; - } - if (channel >= mci->csrows[csrow].nr_channels) { - /* something is wrong */ - edac_mc_printk(mci, KERN_ERR, - "INTERNAL ERROR: channel out of range (%d >= %d)\n", - channel, mci->csrows[csrow].nr_channels); - edac_mc_handle_ce_no_info(mci, "INTERNAL ERROR"); - return; + p += sprintf(p, "%s:%d ", + edac_layer_name[mci->layers[i].type], + pos[i]); } - if (edac_mc_get_log_ce()) - /* FIXME - put in DIMM location */ - edac_mc_printk(mci, KERN_WARNING, - "CE row %d, channel %d, label \"%s\": %s\n", - csrow, channel, - mci->csrows[csrow].channels[channel].label, msg); + /* Memory type dependent details about the error */ + if (type == HW_EVENT_ERR_CORRECTED) { + snprintf(detail, sizeof(detail), + "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx", + page_frame_number, offset_in_page, + grain, syndrome); + edac_ce_error(mci, pos, msg, location, label, detail, + other_detail, enable_per_layer_report, + page_frame_number, offset_in_page, grain); + } else { + snprintf(detail, sizeof(detail), + "page:0x%lx offset:0x%lx grain:%d", + page_frame_number, offset_in_page, grain); - mci->ce_count++; - mci->csrows[csrow].ce_count++; - mci->csrows[csrow].channels[channel].ce_count++; + edac_ue_error(mci, pos, msg, location, label, detail, + other_detail, enable_per_layer_report); + } } -EXPORT_SYMBOL(edac_mc_handle_fbd_ce); +EXPORT_SYMBOL_GPL(edac_mc_handle_error); diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index e9a28f5..f6a29b0 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -144,25 +144,31 @@ static ssize_t csrow_ce_count_show(struct csrow_info *csrow, char *data, static ssize_t csrow_size_show(struct csrow_info *csrow, char *data, int private) { - return sprintf(data, "%u\n", PAGES_TO_MiB(csrow->nr_pages)); + int i; + u32 nr_pages = 0; + + for (i = 0; i < csrow->nr_channels; i++) + nr_pages += csrow->channels[i].dimm->nr_pages; + + return sprintf(data, "%u\n", PAGES_TO_MiB(nr_pages)); } static ssize_t csrow_mem_type_show(struct csrow_info *csrow, char *data, int private) { - return sprintf(data, "%s\n", mem_types[csrow->mtype]); + return sprintf(data, "%s\n", mem_types[csrow->channels[0].dimm->mtype]); } static ssize_t csrow_dev_type_show(struct csrow_info *csrow, char *data, int private) { - return sprintf(data, "%s\n", dev_types[csrow->dtype]); + return sprintf(data, "%s\n", dev_types[csrow->channels[0].dimm->dtype]); } static ssize_t csrow_edac_mode_show(struct csrow_info *csrow, char *data, int private) { - return sprintf(data, "%s\n", edac_caps[csrow->edac_mode]); + return sprintf(data, "%s\n", edac_caps[csrow->channels[0].dimm->edac_mode]); } /* show/store functions for DIMM Label attributes */ @@ -170,11 +176,11 @@ static ssize_t channel_dimm_label_show(struct csrow_info *csrow, char *data, int channel) { /* if field has not been initialized, there is nothing to send */ - if (!csrow->channels[channel].label[0]) + if (!csrow->channels[channel].dimm->label[0]) return 0; return snprintf(data, EDAC_MC_LABEL_LEN, "%s\n", - csrow->channels[channel].label); + csrow->channels[channel].dimm->label); } static ssize_t channel_dimm_label_store(struct csrow_info *csrow, @@ -184,8 +190,8 @@ static ssize_t channel_dimm_label_store(struct csrow_info *csrow, ssize_t max_size = 0; max_size = min((ssize_t) count, (ssize_t) EDAC_MC_LABEL_LEN - 1); - strncpy(csrow->channels[channel].label, data, max_size); - csrow->channels[channel].label[max_size] = '\0'; + strncpy(csrow->channels[channel].dimm->label, data, max_size); + csrow->channels[channel].dimm->label[max_size] = '\0'; return max_size; } @@ -419,8 +425,8 @@ static ssize_t mci_reset_counters_store(struct mem_ctl_info *mci, mci->ue_noinfo_count = 0; mci->ce_noinfo_count = 0; - mci->ue_count = 0; - mci->ce_count = 0; + mci->ue_mc = 0; + mci->ce_mc = 0; for (row = 0; row < mci->nr_csrows; row++) { struct csrow_info *ri = &mci->csrows[row]; @@ -489,12 +495,12 @@ static ssize_t mci_sdram_scrub_rate_show(struct mem_ctl_info *mci, char *data) /* default attribute files for the MCI object */ static ssize_t mci_ue_count_show(struct mem_ctl_info *mci, char *data) { - return sprintf(data, "%d\n", mci->ue_count); + return sprintf(data, "%d\n", mci->ue_mc); } static ssize_t mci_ce_count_show(struct mem_ctl_info *mci, char *data) { - return sprintf(data, "%d\n", mci->ce_count); + return sprintf(data, "%d\n", mci->ce_mc); } static ssize_t mci_ce_noinfo_show(struct mem_ctl_info *mci, char *data) @@ -519,16 +525,16 @@ static ssize_t mci_ctl_name_show(struct mem_ctl_info *mci, char *data) static ssize_t mci_size_mb_show(struct mem_ctl_info *mci, char *data) { - int total_pages, csrow_idx; + int total_pages = 0, csrow_idx, j; - for (total_pages = csrow_idx = 0; csrow_idx < mci->nr_csrows; - csrow_idx++) { + for (csrow_idx = 0; csrow_idx < mci->nr_csrows; csrow_idx++) { struct csrow_info *csrow = &mci->csrows[csrow_idx]; - if (!csrow->nr_pages) - continue; + for (j = 0; j < csrow->nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j].dimm; - total_pages += csrow->nr_pages; + total_pages += dimm->nr_pages; + } } return sprintf(data, "%u\n", PAGES_TO_MiB(total_pages)); @@ -900,7 +906,7 @@ static void edac_remove_mci_instance_attributes(struct mem_ctl_info *mci, */ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci) { - int i; + int i, j; int err; struct csrow_info *csrow; struct kobject *kobj_mci = &mci->edac_mci_kobj; @@ -934,10 +940,13 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci) /* Make directories for each CSROW object under the mc<id> kobject */ for (i = 0; i < mci->nr_csrows; i++) { + int nr_pages = 0; + csrow = &mci->csrows[i]; + for (j = 0; j < csrow->nr_channels; j++) + nr_pages += csrow->channels[j].dimm->nr_pages; - /* Only expose populated CSROWs */ - if (csrow->nr_pages > 0) { + if (nr_pages > 0) { err = edac_create_csrow_object(mci, csrow, i); if (err) { debugf1("%s() failure: create csrow %d obj\n", @@ -949,12 +958,15 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci) return 0; - /* CSROW error: backout what has already been registered, */ fail1: for (i--; i >= 0; i--) { - if (csrow->nr_pages > 0) { + int nr_pages = 0; + + csrow = &mci->csrows[i]; + for (j = 0; j < csrow->nr_channels; j++) + nr_pages += csrow->channels[j].dimm->nr_pages; + if (nr_pages > 0) kobject_put(&mci->csrows[i].kobj); - } } /* remove the mci instance's attributes, if any */ @@ -973,14 +985,20 @@ fail0: */ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) { - int i; + struct csrow_info *csrow; + int i, j; debugf0("%s()\n", __func__); /* remove all csrow kobjects */ debugf4("%s() unregister this mci kobj\n", __func__); for (i = 0; i < mci->nr_csrows; i++) { - if (mci->csrows[i].nr_pages > 0) { + int nr_pages = 0; + + csrow = &mci->csrows[i]; + for (j = 0; j < csrow->nr_channels; j++) + nr_pages += csrow->channels[j].dimm->nr_pages; + if (nr_pages > 0) { debugf0("%s() unreg csrow-%d\n", __func__, i); kobject_put(&mci->csrows[i].kobj); } diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 00f81b4..0ea7d14 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -50,7 +50,7 @@ extern void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, unsigned long value); extern void edac_mc_reset_delay_period(int value); -extern void *edac_align_ptr(void *ptr, unsigned size); +extern void *edac_align_ptr(void **p, unsigned size, int n_elems); /* * EDAC PCI functions diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 63af1c5..f1ac866 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -42,13 +42,13 @@ struct edac_pci_ctl_info *edac_pci_alloc_ctl_info(unsigned int sz_pvt, const char *edac_pci_name) { struct edac_pci_ctl_info *pci; - void *pvt; + void *p = NULL, *pvt; unsigned int size; debugf1("%s()\n", __func__); - pci = (struct edac_pci_ctl_info *)0; - pvt = edac_align_ptr(&pci[1], sz_pvt); + pci = edac_align_ptr(&p, sizeof(*pci), 1); + pvt = edac_align_ptr(&p, 1, sz_pvt); size = ((unsigned long)pvt) + sz_pvt; /* Alloc the needed control struct memory */ diff --git a/drivers/edac/i3000_edac.c b/drivers/edac/i3000_edac.c index 277689a..8ad1744 100644 --- a/drivers/edac/i3000_edac.c +++ b/drivers/edac/i3000_edac.c @@ -245,7 +245,9 @@ static int i3000_process_error_info(struct mem_ctl_info *mci, return 1; if ((info->errsts ^ info->errsts2) & I3000_ERRSTS_BITS) { - edac_mc_handle_ce_no_info(mci, "UE overwrote CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, + "UE overwrote CE", "", NULL); info->errsts = info->errsts2; } @@ -256,10 +258,15 @@ static int i3000_process_error_info(struct mem_ctl_info *mci, row = edac_mc_find_csrow_by_page(mci, pfn); if (info->errsts & I3000_ERRSTS_UE) - edac_mc_handle_ue(mci, pfn, offset, row, "i3000 UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + pfn, offset, 0, + row, -1, -1, + "i3000 UE", "", NULL); else - edac_mc_handle_ce(mci, pfn, offset, info->derrsyn, row, - multi_chan ? channel : 0, "i3000 CE"); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + pfn, offset, info->derrsyn, + row, multi_chan ? channel : 0, -1, + "i3000 CE", "", NULL); return 1; } @@ -304,9 +311,10 @@ static int i3000_is_interleaved(const unsigned char *c0dra, static int i3000_probe1(struct pci_dev *pdev, int dev_idx) { int rc; - int i; + int i, j; struct mem_ctl_info *mci = NULL; - unsigned long last_cumul_size; + struct edac_mc_layer layers[2]; + unsigned long last_cumul_size, nr_pages; int interleaved, nr_channels; unsigned char dra[I3000_RANKS / 2], drb[I3000_RANKS]; unsigned char *c0dra = dra, *c1dra = &dra[I3000_RANKS_PER_CHANNEL / 2]; @@ -347,7 +355,14 @@ static int i3000_probe1(struct pci_dev *pdev, int dev_idx) */ interleaved = i3000_is_interleaved(c0dra, c1dra, c0drb, c1drb); nr_channels = interleaved ? 2 : 1; - mci = edac_mc_alloc(0, I3000_RANKS / nr_channels, nr_channels, 0); + + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = I3000_RANKS / nr_channels; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = nr_channels; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0); if (!mci) return -ENOMEM; @@ -386,19 +401,23 @@ static int i3000_probe1(struct pci_dev *pdev, int dev_idx) cumul_size <<= 1; debugf3("MC: %s(): (%d) cumul_size 0x%x\n", __func__, i, cumul_size); - if (cumul_size == last_cumul_size) { - csrow->mtype = MEM_EMPTY; + if (cumul_size == last_cumul_size) continue; - } csrow->first_page = last_cumul_size; csrow->last_page = cumul_size - 1; - csrow->nr_pages = cumul_size - last_cumul_size; + nr_pages = cumul_size - last_cumul_size; last_cumul_size = cumul_size; - csrow->grain = I3000_DEAP_GRAIN; - csrow->mtype = MEM_DDR2; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = EDAC_UNKNOWN; + + for (j = 0; j < nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j].dimm; + + dimm->nr_pages = nr_pages / nr_channels; + dimm->grain = I3000_DEAP_GRAIN; + dimm->mtype = MEM_DDR2; + dimm->dtype = DEV_UNKNOWN; + dimm->edac_mode = EDAC_UNKNOWN; + } } /* diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c index 046808c..bbe43ef 100644 --- a/drivers/edac/i3200_edac.c +++ b/drivers/edac/i3200_edac.c @@ -23,6 +23,7 @@ #define PCI_DEVICE_ID_INTEL_3200_HB 0x29f0 +#define I3200_DIMMS 4 #define I3200_RANKS 8 #define I3200_RANKS_PER_CHANNEL 4 #define I3200_CHANNELS 2 @@ -217,21 +218,25 @@ static void i3200_process_error_info(struct mem_ctl_info *mci, return; if ((info->errsts ^ info->errsts2) & I3200_ERRSTS_BITS) { - edac_mc_handle_ce_no_info(mci, "UE overwrote CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, "UE overwrote CE", "", NULL); info->errsts = info->errsts2; } for (channel = 0; channel < nr_channels; channel++) { log = info->eccerrlog[channel]; if (log & I3200_ECCERRLOG_UE) { - edac_mc_handle_ue(mci, 0, 0, - eccerrlog_row(channel, log), - "i3200 UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 0, 0, 0, + eccerrlog_row(channel, log), + -1, -1, + "i3000 UE", "", NULL); } else if (log & I3200_ECCERRLOG_CE) { - edac_mc_handle_ce(mci, 0, 0, - eccerrlog_syndrome(log), - eccerrlog_row(channel, log), 0, - "i3200 CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 0, 0, eccerrlog_syndrome(log), + eccerrlog_row(channel, log), + -1, -1, + "i3000 UE", "", NULL); } } } @@ -319,9 +324,9 @@ static unsigned long drb_to_nr_pages( static int i3200_probe1(struct pci_dev *pdev, int dev_idx) { int rc; - int i; + int i, j; struct mem_ctl_info *mci = NULL; - unsigned long last_page; + struct edac_mc_layer layers[2]; u16 drbs[I3200_CHANNELS][I3200_RANKS_PER_CHANNEL]; bool stacked; void __iomem *window; @@ -336,8 +341,14 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx) i3200_get_drbs(window, drbs); nr_channels = how_many_channels(pdev); - mci = edac_mc_alloc(sizeof(struct i3200_priv), I3200_RANKS, - nr_channels, 0); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = I3200_DIMMS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = nr_channels; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, + sizeof(struct i3200_priv)); if (!mci) return -ENOMEM; @@ -366,7 +377,6 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx) * cumulative; the last one will contain the total memory * contained in all ranks. */ - last_page = -1UL; for (i = 0; i < mci->nr_csrows; i++) { unsigned long nr_pages; struct csrow_info *csrow = &mci->csrows[i]; @@ -375,20 +385,18 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx) i / I3200_RANKS_PER_CHANNEL, i % I3200_RANKS_PER_CHANNEL); - if (nr_pages == 0) { - csrow->mtype = MEM_EMPTY; + if (nr_pages == 0) continue; - } - csrow->first_page = last_page + 1; - last_page += nr_pages; - csrow->last_page = last_page; - csrow->nr_pages = nr_pages; + for (j = 0; j < nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j].dimm; - csrow->grain = nr_pages << PAGE_SHIFT; - csrow->mtype = MEM_DDR2; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = EDAC_UNKNOWN; + dimm->nr_pages = nr_pages / nr_channels; + dimm->grain = nr_pages << PAGE_SHIFT; + dimm->mtype = MEM_DDR2; + dimm->dtype = DEV_UNKNOWN; + dimm->edac_mode = EDAC_UNKNOWN; + } } i3200_clear_error_info(mci); diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c index a2680d8..11ea835 100644 --- a/drivers/edac/i5000_edac.c +++ b/drivers/edac/i5000_edac.c @@ -270,7 +270,8 @@ #define MTR3 0x8C #define NUM_MTRS 4 -#define CHANNELS_PER_BRANCH (2) +#define CHANNELS_PER_BRANCH 2 +#define MAX_BRANCHES 2 /* Defines to extract the vaious fields from the * MTRx - Memory Technology Registers @@ -473,7 +474,6 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci, char msg[EDAC_MC_LABEL_LEN + 1 + 160]; char *specific = NULL; u32 allErrors; - int branch; int channel; int bank; int rank; @@ -485,8 +485,7 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci, if (!allErrors) return; /* if no error, return now */ - branch = EXTRACT_FBDCHAN_INDX(info->ferr_fat_fbd); - channel = branch; + channel = EXTRACT_FBDCHAN_INDX(info->ferr_fat_fbd); /* Use the NON-Recoverable macros to extract data */ bank = NREC_BANK(info->nrecmema); @@ -495,9 +494,9 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci, ras = NREC_RAS(info->nrecmemb); cas = NREC_CAS(info->nrecmemb); - debugf0("\t\tCSROW= %d Channels= %d,%d (Branch= %d " - "DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n", - rank, channel, channel + 1, branch >> 1, bank, + debugf0("\t\tCSROW= %d Channel= %d " + "(DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n", + rank, channel, bank, rdwr ? "Write" : "Read", ras, cas); /* Only 1 bit will be on */ @@ -533,13 +532,14 @@ static void i5000_process_fatal_error_info(struct mem_ctl_info *mci, /* Form out message */ snprintf(msg, sizeof(msg), - "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d CAS=%d " - "FATAL Err=0x%x (%s))", - branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas, - allErrors, specific); + "Bank=%d RAS=%d CAS=%d FATAL Err=0x%x (%s)", + bank, ras, cas, allErrors, specific); /* Call the helper to output message */ - edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); + edac_mc_handle_error(HW_EVENT_ERR_FATAL, mci, 0, 0, 0, + channel >> 1, channel & 1, rank, + rdwr ? "Write error" : "Read error", + msg, NULL); } /* @@ -633,13 +633,14 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci, /* Form out message */ snprintf(msg, sizeof(msg), - "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d " - "CAS=%d, UE Err=0x%x (%s))", - branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas, - ue_errors, specific); + "Rank=%d Bank=%d RAS=%d CAS=%d, UE Err=0x%x (%s)", + rank, bank, ras, cas, ue_errors, specific); /* Call the helper to output message */ - edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + channel >> 1, -1, rank, + rdwr ? "Write error" : "Read error", + msg, NULL); } /* Check correctable errors */ @@ -685,13 +686,16 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci, /* Form out message */ snprintf(msg, sizeof(msg), - "(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d " + "Rank=%d Bank=%d RDWR=%s RAS=%d " "CAS=%d, CE Err=0x%x (%s))", branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas, ce_errors, specific); /* Call the helper to output message */ - edac_mc_handle_fbd_ce(mci, rank, channel, msg); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0, + channel >> 1, channel % 2, rank, + rdwr ? "Write error" : "Read error", + msg, NULL); } if (!misc_messages) @@ -731,11 +735,12 @@ static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci, /* Form out message */ snprintf(msg, sizeof(msg), - "(Branch=%d Err=%#x (%s))", branch >> 1, - misc_errors, specific); + "Err=%#x (%s)", misc_errors, specific); /* Call the helper to output message */ - edac_mc_handle_fbd_ce(mci, 0, 0, msg); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0, + branch >> 1, -1, -1, + "Misc error", msg, NULL); } } @@ -956,14 +961,14 @@ static int determine_amb_present_reg(struct i5000_pvt *pvt, int channel) * * return the proper MTR register as determine by the csrow and channel desired */ -static int determine_mtr(struct i5000_pvt *pvt, int csrow, int channel) +static int determine_mtr(struct i5000_pvt *pvt, int slot, int channel) { int mtr; if (channel < CHANNELS_PER_BRANCH) - mtr = pvt->b0_mtr[csrow >> 1]; + mtr = pvt->b0_mtr[slot]; else - mtr = pvt->b1_mtr[csrow >> 1]; + mtr = pvt->b1_mtr[slot]; return mtr; } @@ -988,37 +993,34 @@ static void decode_mtr(int slot_row, u16 mtr) debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]); } -static void handle_channel(struct i5000_pvt *pvt, int csrow, int channel, +static void handle_channel(struct i5000_pvt *pvt, int slot, int channel, struct i5000_dimm_info *dinfo) { int mtr; int amb_present_reg; int addrBits; - mtr = determine_mtr(pvt, csrow, channel); + mtr = determine_mtr(pvt, slot, channel); if (MTR_DIMMS_PRESENT(mtr)) { amb_present_reg = determine_amb_present_reg(pvt, channel); - /* Determine if there is a DIMM present in this DIMM slot */ - if (amb_present_reg & (1 << (csrow >> 1))) { + /* Determine if there is a DIMM present in this DIMM slot */ + if (amb_present_reg) { dinfo->dual_rank = MTR_DIMM_RANK(mtr); - if (!((dinfo->dual_rank == 0) && - ((csrow & 0x1) == 0x1))) { - /* Start with the number of bits for a Bank - * on the DRAM */ - addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr); - /* Add thenumber of ROW bits */ - addrBits += MTR_DIMM_ROWS_ADDR_BITS(mtr); - /* add the number of COLUMN bits */ - addrBits += MTR_DIMM_COLS_ADDR_BITS(mtr); - - addrBits += 6; /* add 64 bits per DIMM */ - addrBits -= 20; /* divide by 2^^20 */ - addrBits -= 3; /* 8 bits per bytes */ - - dinfo->megabytes = 1 << addrBits; - } + /* Start with the number of bits for a Bank + * on the DRAM */ + addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr); + /* Add the number of ROW bits */ + addrBits += MTR_DIMM_ROWS_ADDR_BITS(mtr); + /* add the number of COLUMN bits */ + addrBits += MTR_DIMM_COLS_ADDR_BITS(mtr); + + addrBits += 6; /* add 64 bits per DIMM */ + addrBits -= 20; /* divide by 2^^20 */ + addrBits -= 3; /* 8 bits per bytes */ + + dinfo->megabytes = 1 << addrBits; } } } @@ -1032,10 +1034,9 @@ static void handle_channel(struct i5000_pvt *pvt, int csrow, int channel, static void calculate_dimm_size(struct i5000_pvt *pvt) { struct i5000_dimm_info *dinfo; - int csrow, max_csrows; + int slot, channel, branch; char *p, *mem_buffer; int space, n; - int channel; /* ================= Generate some debug output ================= */ space = PAGE_SIZE; @@ -1046,22 +1047,17 @@ static void calculate_dimm_size(struct i5000_pvt *pvt) return; } - n = snprintf(p, space, "\n"); - p += n; - space -= n; - - /* Scan all the actual CSROWS (which is # of DIMMS * 2) + /* Scan all the actual slots * and calculate the information for each DIMM - * Start with the highest csrow first, to display it first - * and work toward the 0th csrow + * Start with the highest slot first, to display it first + * and work toward the 0th slot */ - max_csrows = pvt->maxdimmperch * 2; - for (csrow = max_csrows - 1; csrow >= 0; csrow--) { + for (slot = pvt->maxdimmperch - 1; slot >= 0; slot--) { - /* on an odd csrow, first output a 'boundary' marker, + /* on an odd slot, first output a 'boundary' marker, * then reset the message buffer */ - if (csrow & 0x1) { - n = snprintf(p, space, "---------------------------" + if (slot & 0x1) { + n = snprintf(p, space, "--------------------------" "--------------------------------"); p += n; space -= n; @@ -1069,30 +1065,39 @@ static void calculate_dimm_size(struct i5000_pvt *pvt) p = mem_buffer; space = PAGE_SIZE; } - n = snprintf(p, space, "csrow %2d ", csrow); + n = snprintf(p, space, "slot %2d ", slot); p += n; space -= n; for (channel = 0; channel < pvt->maxch; channel++) { - dinfo = &pvt->dimm_info[csrow][channel]; - handle_channel(pvt, csrow, channel, dinfo); - n = snprintf(p, space, "%4d MB | ", dinfo->megabytes); + dinfo = &pvt->dimm_info[slot][channel]; + handle_channel(pvt, slot, channel, dinfo); + if (dinfo->megabytes) + n = snprintf(p, space, "%4d MB %dR| ", + dinfo->megabytes, dinfo->dual_rank + 1); + else + n = snprintf(p, space, "%4d MB | ", 0); p += n; space -= n; } - n = snprintf(p, space, "\n"); p += n; space -= n; + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; } /* Output the last bottom 'boundary' marker */ - n = snprintf(p, space, "---------------------------" - "--------------------------------\n"); + n = snprintf(p, space, "--------------------------" + "--------------------------------"); p += n; space -= n; + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; /* now output the 'channel' labels */ - n = snprintf(p, space, " "); + n = snprintf(p, space, " "); p += n; space -= n; for (channel = 0; channel < pvt->maxch; channel++) { @@ -1100,9 +1105,17 @@ static void calculate_dimm_size(struct i5000_pvt *pvt) p += n; space -= n; } - n = snprintf(p, space, "\n"); + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; + + n = snprintf(p, space, " "); p += n; - space -= n; + for (branch = 0; branch < MAX_BRANCHES; branch++) { + n = snprintf(p, space, " branch %d | ", branch); + p += n; + space -= n; + } /* output the last message and free buffer */ debugf2("%s\n", mem_buffer); @@ -1235,13 +1248,13 @@ static void i5000_get_mc_regs(struct mem_ctl_info *mci) static int i5000_init_csrows(struct mem_ctl_info *mci) { struct i5000_pvt *pvt; - struct csrow_info *p_csrow; + struct dimm_info *dimm; int empty, channel_count; int max_csrows; - int mtr, mtr1; + int mtr; int csrow_megs; int channel; - int csrow; + int slot; pvt = mci->pvt_info; @@ -1250,43 +1263,40 @@ static int i5000_init_csrows(struct mem_ctl_info *mci) empty = 1; /* Assume NO memory */ - for (csrow = 0; csrow < max_csrows; csrow++) { - p_csrow = &mci->csrows[csrow]; - - p_csrow->csrow_idx = csrow; - - /* use branch 0 for the basis */ - mtr = pvt->b0_mtr[csrow >> 1]; - mtr1 = pvt->b1_mtr[csrow >> 1]; - - /* if no DIMMS on this row, continue */ - if (!MTR_DIMMS_PRESENT(mtr) && !MTR_DIMMS_PRESENT(mtr1)) - continue; + /* + * FIXME: The memory layout used to map slot/channel into the + * real memory architecture is weird: branch+slot are "csrows" + * and channel is channel. That required an extra array (dimm_info) + * to map the dimms. A good cleanup would be to remove this array, + * and do a loop here with branch, channel, slot + */ + for (slot = 0; slot < max_csrows; slot++) { + for (channel = 0; channel < pvt->maxch; channel++) { - /* FAKE OUT VALUES, FIXME */ - p_csrow->first_page = 0 + csrow * 20; - p_csrow->last_page = 9 + csrow * 20; - p_csrow->page_mask = 0xFFF; + mtr = determine_mtr(pvt, slot, channel); - p_csrow->grain = 8; + if (!MTR_DIMMS_PRESENT(mtr)) + continue; - csrow_megs = 0; - for (channel = 0; channel < pvt->maxch; channel++) { - csrow_megs += pvt->dimm_info[csrow][channel].megabytes; - } + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, + channel / MAX_BRANCHES, + channel % MAX_BRANCHES, slot); - p_csrow->nr_pages = csrow_megs << 8; + csrow_megs = pvt->dimm_info[slot][channel].megabytes; + dimm->grain = 8; - /* Assume DDR2 for now */ - p_csrow->mtype = MEM_FB_DDR2; + /* Assume DDR2 for now */ + dimm->mtype = MEM_FB_DDR2; - /* ask what device type on this row */ - if (MTR_DRAM_WIDTH(mtr)) - p_csrow->dtype = DEV_X8; - else - p_csrow->dtype = DEV_X4; + /* ask what device type on this row */ + if (MTR_DRAM_WIDTH(mtr)) + dimm->dtype = DEV_X8; + else + dimm->dtype = DEV_X4; - p_csrow->edac_mode = EDAC_S8ECD8ED; + dimm->edac_mode = EDAC_S8ECD8ED; + dimm->nr_pages = csrow_megs << 8; + } empty = 0; } @@ -1317,7 +1327,7 @@ static void i5000_enable_error_reporting(struct mem_ctl_info *mci) } /* - * i5000_get_dimm_and_channel_counts(pdev, &num_csrows, &num_channels) + * i5000_get_dimm_and_channel_counts(pdev, &nr_csrows, &num_channels) * * ask the device how many channels are present and how many CSROWS * as well @@ -1332,7 +1342,7 @@ static void i5000_get_dimm_and_channel_counts(struct pci_dev *pdev, * supported on this memory controller */ pci_read_config_byte(pdev, MAXDIMMPERCH, &value); - *num_dimms_per_channel = (int)value *2; + *num_dimms_per_channel = (int)value; pci_read_config_byte(pdev, MAXCH, &value); *num_channels = (int)value; @@ -1348,10 +1358,10 @@ static void i5000_get_dimm_and_channel_counts(struct pci_dev *pdev, static int i5000_probe1(struct pci_dev *pdev, int dev_idx) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[3]; struct i5000_pvt *pvt; int num_channels; int num_dimms_per_channel; - int num_csrows; debugf0("MC: %s: %s(), pdev bus %u dev=0x%x fn=0x%x\n", __FILE__, __func__, @@ -1377,14 +1387,22 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx) */ i5000_get_dimm_and_channel_counts(pdev, &num_dimms_per_channel, &num_channels); - num_csrows = num_dimms_per_channel * 2; - debugf0("MC: %s(): Number of - Channels= %d DIMMS= %d CSROWS= %d\n", - __func__, num_channels, num_dimms_per_channel, num_csrows); + debugf0("MC: %s(): Number of Branches=2 Channels= %d DIMMS= %d\n", + __func__, num_channels, num_dimms_per_channel); /* allocate a new MC control structure */ - mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0); + layers[0].type = EDAC_MC_LAYER_BRANCH; + layers[0].size = MAX_BRANCHES; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_channels / MAX_BRANCHES; + layers[1].is_virt_csrow = false; + layers[2].type = EDAC_MC_LAYER_SLOT; + layers[2].size = num_dimms_per_channel; + layers[2].is_virt_csrow = true; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (mci == NULL) return -ENOMEM; diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index d500749..e9e7c2a 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -14,6 +14,11 @@ * rows for each respective channel are laid out one after another, * the first half belonging to channel 0, the second half belonging * to channel 1. + * + * This driver is for DDR2 DIMMs, and it uses chip select to select among the + * several ranks. However, instead of showing memories as ranks, it outputs + * them as DIMM's. An internal table creates the association between ranks + * and DIMM's. */ #include <linux/module.h> #include <linux/init.h> @@ -410,14 +415,6 @@ static int i5100_csrow_to_chan(const struct mem_ctl_info *mci, int csrow) return csrow / priv->ranksperchan; } -static unsigned i5100_rank_to_csrow(const struct mem_ctl_info *mci, - int chan, int rank) -{ - const struct i5100_priv *priv = mci->pvt_info; - - return chan * priv->ranksperchan + rank; -} - static void i5100_handle_ce(struct mem_ctl_info *mci, int chan, unsigned bank, @@ -427,17 +424,17 @@ static void i5100_handle_ce(struct mem_ctl_info *mci, unsigned ras, const char *msg) { - const int csrow = i5100_rank_to_csrow(mci, chan, rank); + char detail[80]; - printk(KERN_ERR - "CE chan %d, bank %u, rank %u, syndrome 0x%lx, " - "cas %u, ras %u, csrow %u, label \"%s\": %s\n", - chan, bank, rank, syndrome, cas, ras, - csrow, mci->csrows[csrow].channels[0].label, msg); + /* Form out message */ + snprintf(detail, sizeof(detail), + "bank %u, cas %u, ras %u\n", + bank, cas, ras); - mci->ce_count++; - mci->csrows[csrow].ce_count++; - mci->csrows[csrow].channels[0].ce_count++; + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 0, 0, syndrome, + chan, rank, -1, + msg, detail, NULL); } static void i5100_handle_ue(struct mem_ctl_info *mci, @@ -449,16 +446,17 @@ static void i5100_handle_ue(struct mem_ctl_info *mci, unsigned ras, const char *msg) { - const int csrow = i5100_rank_to_csrow(mci, chan, rank); + char detail[80]; - printk(KERN_ERR - "UE chan %d, bank %u, rank %u, syndrome 0x%lx, " - "cas %u, ras %u, csrow %u, label \"%s\": %s\n", - chan, bank, rank, syndrome, cas, ras, - csrow, mci->csrows[csrow].channels[0].label, msg); + /* Form out message */ + snprintf(detail, sizeof(detail), + "bank %u, cas %u, ras %u\n", + bank, cas, ras); - mci->ue_count++; - mci->csrows[csrow].ue_count++; + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 0, 0, syndrome, + chan, rank, -1, + msg, detail, NULL); } static void i5100_read_log(struct mem_ctl_info *mci, int chan, @@ -835,10 +833,10 @@ static void __devinit i5100_init_interleaving(struct pci_dev *pdev, static void __devinit i5100_init_csrows(struct mem_ctl_info *mci) { int i; - unsigned long total_pages = 0UL; struct i5100_priv *priv = mci->pvt_info; - for (i = 0; i < mci->nr_csrows; i++) { + for (i = 0; i < mci->tot_dimms; i++) { + struct dimm_info *dimm; const unsigned long npages = i5100_npages(mci, i); const unsigned chan = i5100_csrow_to_chan(mci, i); const unsigned rank = i5100_csrow_to_rank(mci, i); @@ -846,33 +844,23 @@ static void __devinit i5100_init_csrows(struct mem_ctl_info *mci) if (!npages) continue; - /* - * FIXME: these two are totally bogus -- I don't see how to - * map them correctly to this structure... - */ - mci->csrows[i].first_page = total_pages; - mci->csrows[i].last_page = total_pages + npages - 1; - mci->csrows[i].page_mask = 0UL; - - mci->csrows[i].nr_pages = npages; - mci->csrows[i].grain = 32; - mci->csrows[i].csrow_idx = i; - mci->csrows[i].dtype = - (priv->mtr[chan][rank].width == 4) ? DEV_X4 : DEV_X8; - mci->csrows[i].ue_count = 0; - mci->csrows[i].ce_count = 0; - mci->csrows[i].mtype = MEM_RDDR2; - mci->csrows[i].edac_mode = EDAC_SECDED; - mci->csrows[i].mci = mci; - mci->csrows[i].nr_channels = 1; - mci->csrows[i].channels[0].chan_idx = 0; - mci->csrows[i].channels[0].ce_count = 0; - mci->csrows[i].channels[0].csrow = mci->csrows + i; - snprintf(mci->csrows[i].channels[0].label, - sizeof(mci->csrows[i].channels[0].label), - "DIMM%u", i5100_rank_to_slot(mci, chan, rank)); - - total_pages += npages; + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, + chan, rank, 0); + + dimm->nr_pages = npages; + if (npages) { + dimm->grain = 32; + dimm->dtype = (priv->mtr[chan][rank].width == 4) ? + DEV_X4 : DEV_X8; + dimm->mtype = MEM_RDDR2; + dimm->edac_mode = EDAC_SECDED; + snprintf(dimm->label, sizeof(dimm->label), + "DIMM%u", + i5100_rank_to_slot(mci, chan, rank)); + } + + debugf2("dimm channel %d, rank %d, size %ld\n", + chan, rank, (long)PAGES_TO_MiB(npages)); } } @@ -881,6 +869,7 @@ static int __devinit i5100_init_one(struct pci_dev *pdev, { int rc; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct i5100_priv *priv; struct pci_dev *ch0mm, *ch1mm; int ret = 0; @@ -941,7 +930,14 @@ static int __devinit i5100_init_one(struct pci_dev *pdev, goto bail_ch1; } - mci = edac_mc_alloc(sizeof(*priv), ranksperch * 2, 1, 0); + layers[0].type = EDAC_MC_LAYER_CHANNEL; + layers[0].size = 2; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_SLOT; + layers[1].size = ranksperch; + layers[1].is_virt_csrow = true; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, + sizeof(*priv)); if (!mci) { ret = -ENOMEM; goto bail_disable_ch1; diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 1869a10..6640c29 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -18,6 +18,10 @@ * Intel 5400 Chipset Memory Controller Hub (MCH) - Datasheet * http://developer.intel.com/design/chipsets/datashts/313070.htm * + * This Memory Controller manages DDR2 FB-DIMMs. It has 2 branches, each with + * 2 channels operating in lockstep no-mirror mode. Each channel can have up to + * 4 dimm's, each with up to 8GB. + * */ #include <linux/module.h> @@ -44,12 +48,10 @@ edac_mc_chipset_printk(mci, level, "i5400", fmt, ##arg) /* Limits for i5400 */ -#define NUM_MTRS_PER_BRANCH 4 +#define MAX_BRANCHES 2 #define CHANNELS_PER_BRANCH 2 -#define MAX_DIMMS_PER_CHANNEL NUM_MTRS_PER_BRANCH -#define MAX_CHANNELS 4 -/* max possible csrows per channel */ -#define MAX_CSROWS (MAX_DIMMS_PER_CHANNEL) +#define DIMMS_PER_CHANNEL 4 +#define MAX_CHANNELS (MAX_BRANCHES * CHANNELS_PER_BRANCH) /* Device 16, * Function 0: System Address @@ -347,16 +349,16 @@ struct i5400_pvt { u16 mir0, mir1; - u16 b0_mtr[NUM_MTRS_PER_BRANCH]; /* Memory Technlogy Reg */ + u16 b0_mtr[DIMMS_PER_CHANNEL]; /* Memory Technlogy Reg */ u16 b0_ambpresent0; /* Branch 0, Channel 0 */ u16 b0_ambpresent1; /* Brnach 0, Channel 1 */ - u16 b1_mtr[NUM_MTRS_PER_BRANCH]; /* Memory Technlogy Reg */ + u16 b1_mtr[DIMMS_PER_CHANNEL]; /* Memory Technlogy Reg */ u16 b1_ambpresent0; /* Branch 1, Channel 8 */ u16 b1_ambpresent1; /* Branch 1, Channel 1 */ /* DIMM information matrix, allocating architecture maximums */ - struct i5400_dimm_info dimm_info[MAX_CSROWS][MAX_CHANNELS]; + struct i5400_dimm_info dimm_info[DIMMS_PER_CHANNEL][MAX_CHANNELS]; /* Actual values for this controller */ int maxch; /* Max channels */ @@ -532,13 +534,15 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci, int ras, cas; int errnum; char *type = NULL; + enum hw_event_mc_err_type tp_event = HW_EVENT_ERR_UNCORRECTED; if (!allErrors) return; /* if no error, return now */ - if (allErrors & ERROR_FAT_MASK) + if (allErrors & ERROR_FAT_MASK) { type = "FATAL"; - else if (allErrors & FERR_NF_UNCORRECTABLE) + tp_event = HW_EVENT_ERR_FATAL; + } else if (allErrors & FERR_NF_UNCORRECTABLE) type = "NON-FATAL uncorrected"; else type = "NON-FATAL recoverable"; @@ -556,7 +560,7 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci, ras = nrec_ras(info); cas = nrec_cas(info); - debugf0("\t\tCSROW= %d Channels= %d,%d (Branch= %d " + debugf0("\t\tDIMM= %d Channels= %d,%d (Branch= %d " "DRAM Bank= %d Buffer ID = %d rdwr= %s ras= %d cas= %d)\n", rank, channel, channel + 1, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas); @@ -566,13 +570,13 @@ static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci, /* Form out message */ snprintf(msg, sizeof(msg), - "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s " - "RAS=%d CAS=%d %s Err=0x%lx (%s))", - type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas, - type, allErrors, error_name[errnum]); + "Bank=%d Buffer ID = %d RAS=%d CAS=%d Err=0x%lx (%s)", + bank, buf_id, ras, cas, allErrors, error_name[errnum]); - /* Call the helper to output message */ - edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); + edac_mc_handle_error(tp_event, mci, 0, 0, 0, + branch >> 1, -1, rank, + rdwr ? "Write error" : "Read error", + msg, NULL); } /* @@ -630,7 +634,7 @@ static void i5400_process_nonfatal_error_info(struct mem_ctl_info *mci, /* Only 1 bit will be on */ errnum = find_first_bit(&allErrors, ARRAY_SIZE(error_name)); - debugf0("\t\tCSROW= %d Channel= %d (Branch %d " + debugf0("\t\tDIMM= %d Channel= %d (Branch %d " "DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n", rank, channel, branch >> 1, bank, rdwr_str(rdwr), ras, cas); @@ -642,8 +646,10 @@ static void i5400_process_nonfatal_error_info(struct mem_ctl_info *mci, branch >> 1, bank, rdwr_str(rdwr), ras, cas, allErrors, error_name[errnum]); - /* Call the helper to output message */ - edac_mc_handle_fbd_ce(mci, rank, channel, msg); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0, + branch >> 1, channel % 2, rank, + rdwr ? "Write error" : "Read error", + msg, NULL); return; } @@ -831,8 +837,8 @@ static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx) /* * determine_amb_present * - * the information is contained in NUM_MTRS_PER_BRANCH different - * registers determining which of the NUM_MTRS_PER_BRANCH requires + * the information is contained in DIMMS_PER_CHANNEL different + * registers determining which of the DIMMS_PER_CHANNEL requires * knowing which channel is in question * * 2 branches, each with 2 channels @@ -861,11 +867,11 @@ static int determine_amb_present_reg(struct i5400_pvt *pvt, int channel) } /* - * determine_mtr(pvt, csrow, channel) + * determine_mtr(pvt, dimm, channel) * - * return the proper MTR register as determine by the csrow and desired channel + * return the proper MTR register as determine by the dimm and desired channel */ -static int determine_mtr(struct i5400_pvt *pvt, int csrow, int channel) +static int determine_mtr(struct i5400_pvt *pvt, int dimm, int channel) { int mtr; int n; @@ -873,11 +879,11 @@ static int determine_mtr(struct i5400_pvt *pvt, int csrow, int channel) /* There is one MTR for each slot pair of FB-DIMMs, Each slot pair may be at branch 0 or branch 1. */ - n = csrow; + n = dimm; - if (n >= NUM_MTRS_PER_BRANCH) { - debugf0("ERROR: trying to access an invalid csrow: %d\n", - csrow); + if (n >= DIMMS_PER_CHANNEL) { + debugf0("ERROR: trying to access an invalid dimm: %d\n", + dimm); return 0; } @@ -913,19 +919,19 @@ static void decode_mtr(int slot_row, u16 mtr) debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]); } -static void handle_channel(struct i5400_pvt *pvt, int csrow, int channel, +static void handle_channel(struct i5400_pvt *pvt, int dimm, int channel, struct i5400_dimm_info *dinfo) { int mtr; int amb_present_reg; int addrBits; - mtr = determine_mtr(pvt, csrow, channel); + mtr = determine_mtr(pvt, dimm, channel); if (MTR_DIMMS_PRESENT(mtr)) { amb_present_reg = determine_amb_present_reg(pvt, channel); /* Determine if there is a DIMM present in this DIMM slot */ - if (amb_present_reg & (1 << csrow)) { + if (amb_present_reg & (1 << dimm)) { /* Start with the number of bits for a Bank * on the DRAM */ addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr); @@ -954,10 +960,10 @@ static void handle_channel(struct i5400_pvt *pvt, int csrow, int channel, static void calculate_dimm_size(struct i5400_pvt *pvt) { struct i5400_dimm_info *dinfo; - int csrow, max_csrows; + int dimm, max_dimms; char *p, *mem_buffer; int space, n; - int channel; + int channel, branch; /* ================= Generate some debug output ================= */ space = PAGE_SIZE; @@ -968,32 +974,32 @@ static void calculate_dimm_size(struct i5400_pvt *pvt) return; } - /* Scan all the actual CSROWS + /* Scan all the actual DIMMS * and calculate the information for each DIMM - * Start with the highest csrow first, to display it first - * and work toward the 0th csrow + * Start with the highest dimm first, to display it first + * and work toward the 0th dimm */ - max_csrows = pvt->maxdimmperch; - for (csrow = max_csrows - 1; csrow >= 0; csrow--) { + max_dimms = pvt->maxdimmperch; + for (dimm = max_dimms - 1; dimm >= 0; dimm--) { - /* on an odd csrow, first output a 'boundary' marker, + /* on an odd dimm, first output a 'boundary' marker, * then reset the message buffer */ - if (csrow & 0x1) { + if (dimm & 0x1) { n = snprintf(p, space, "---------------------------" - "--------------------------------"); + "-------------------------------"); p += n; space -= n; debugf2("%s\n", mem_buffer); p = mem_buffer; space = PAGE_SIZE; } - n = snprintf(p, space, "csrow %2d ", csrow); + n = snprintf(p, space, "dimm %2d ", dimm); p += n; space -= n; for (channel = 0; channel < pvt->maxch; channel++) { - dinfo = &pvt->dimm_info[csrow][channel]; - handle_channel(pvt, csrow, channel, dinfo); + dinfo = &pvt->dimm_info[dimm][channel]; + handle_channel(pvt, dimm, channel, dinfo); n = snprintf(p, space, "%4d MB | ", dinfo->megabytes); p += n; space -= n; @@ -1005,7 +1011,7 @@ static void calculate_dimm_size(struct i5400_pvt *pvt) /* Output the last bottom 'boundary' marker */ n = snprintf(p, space, "---------------------------" - "--------------------------------"); + "-------------------------------"); p += n; space -= n; debugf2("%s\n", mem_buffer); @@ -1013,7 +1019,7 @@ static void calculate_dimm_size(struct i5400_pvt *pvt) space = PAGE_SIZE; /* now output the 'channel' labels */ - n = snprintf(p, space, " "); + n = snprintf(p, space, " "); p += n; space -= n; for (channel = 0; channel < pvt->maxch; channel++) { @@ -1022,6 +1028,19 @@ static void calculate_dimm_size(struct i5400_pvt *pvt) space -= n; } + space -= n; + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; + + n = snprintf(p, space, " "); + p += n; + for (branch = 0; branch < MAX_BRANCHES; branch++) { + n = snprintf(p, space, " branch %d | ", branch); + p += n; + space -= n; + } + /* output the last message and free buffer */ debugf2("%s\n", mem_buffer); kfree(mem_buffer); @@ -1080,7 +1099,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) debugf2("MIR1: limit= 0x%x WAY1= %u WAY0= %x\n", limit, way1, way0); /* Get the set of MTR[0-3] regs by each branch */ - for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) { + for (slot_row = 0; slot_row < DIMMS_PER_CHANNEL; slot_row++) { int where = MTR0 + (slot_row * sizeof(u16)); /* Branch 0 set of MTR registers */ @@ -1105,7 +1124,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) /* Read and dump branch 0's MTRs */ debugf2("\nMemory Technology Registers:\n"); debugf2(" Branch 0:\n"); - for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) + for (slot_row = 0; slot_row < DIMMS_PER_CHANNEL; slot_row++) decode_mtr(slot_row, pvt->b0_mtr[slot_row]); pci_read_config_word(pvt->branch_0, AMBPRESENT_0, @@ -1122,7 +1141,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) } else { /* Read and dump branch 1's MTRs */ debugf2(" Branch 1:\n"); - for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) + for (slot_row = 0; slot_row < DIMMS_PER_CHANNEL; slot_row++) decode_mtr(slot_row, pvt->b1_mtr[slot_row]); pci_read_config_word(pvt->branch_1, AMBPRESENT_0, @@ -1141,7 +1160,7 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) } /* - * i5400_init_csrows Initialize the 'csrows' table within + * i5400_init_dimms Initialize the 'dimms' table within * the mci control structure with the * addressing of memory. * @@ -1149,64 +1168,68 @@ static void i5400_get_mc_regs(struct mem_ctl_info *mci) * 0 success * 1 no actual memory found on this MC */ -static int i5400_init_csrows(struct mem_ctl_info *mci) +static int i5400_init_dimms(struct mem_ctl_info *mci) { struct i5400_pvt *pvt; - struct csrow_info *p_csrow; - int empty, channel_count; - int max_csrows; + struct dimm_info *dimm; + int ndimms, channel_count; + int max_dimms; int mtr; - int csrow_megs; - int channel; - int csrow; + int size_mb; + int channel, slot; pvt = mci->pvt_info; channel_count = pvt->maxch; - max_csrows = pvt->maxdimmperch; + max_dimms = pvt->maxdimmperch; - empty = 1; /* Assume NO memory */ + ndimms = 0; - for (csrow = 0; csrow < max_csrows; csrow++) { - p_csrow = &mci->csrows[csrow]; - - p_csrow->csrow_idx = csrow; - - /* use branch 0 for the basis */ - mtr = determine_mtr(pvt, csrow, 0); - - /* if no DIMMS on this row, continue */ - if (!MTR_DIMMS_PRESENT(mtr)) - continue; - - /* FAKE OUT VALUES, FIXME */ - p_csrow->first_page = 0 + csrow * 20; - p_csrow->last_page = 9 + csrow * 20; - p_csrow->page_mask = 0xFFF; - - p_csrow->grain = 8; - - csrow_megs = 0; - for (channel = 0; channel < pvt->maxch; channel++) - csrow_megs += pvt->dimm_info[csrow][channel].megabytes; - - p_csrow->nr_pages = csrow_megs << 8; - - /* Assume DDR2 for now */ - p_csrow->mtype = MEM_FB_DDR2; - - /* ask what device type on this row */ - if (MTR_DRAM_WIDTH(mtr)) - p_csrow->dtype = DEV_X8; - else - p_csrow->dtype = DEV_X4; - - p_csrow->edac_mode = EDAC_S8ECD8ED; - - empty = 0; + /* + * FIXME: remove pvt->dimm_info[slot][channel] and use the 3 + * layers here. + */ + for (channel = 0; channel < mci->layers[0].size * mci->layers[1].size; + channel++) { + for (slot = 0; slot < mci->layers[2].size; slot++) { + mtr = determine_mtr(pvt, slot, channel); + + /* if no DIMMS on this slot, continue */ + if (!MTR_DIMMS_PRESENT(mtr)) + continue; + + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, + channel / 2, channel % 2, slot); + + size_mb = pvt->dimm_info[slot][channel].megabytes; + + debugf2("%s: dimm%zd (branch %d channel %d slot %d): %d.%03d GB\n", + __func__, dimm - mci->dimms, + channel / 2, channel % 2, slot, + size_mb / 1000, size_mb % 1000); + + dimm->nr_pages = size_mb << 8; + dimm->grain = 8; + dimm->dtype = MTR_DRAM_WIDTH(mtr) ? DEV_X8 : DEV_X4; + dimm->mtype = MEM_FB_DDR2; + /* + * The eccc mechanism is SDDC (aka SECC), with + * is similar to Chipkill. + */ + dimm->edac_mode = MTR_DRAM_WIDTH(mtr) ? + EDAC_S8ECD8ED : EDAC_S4ECD4ED; + ndimms++; + } } - return empty; + /* + * When just one memory is provided, it should be at location (0,0,0). + * With such single-DIMM mode, the SDCC algorithm degrades to SECDEC+. + */ + if (ndimms == 1) + mci->dimms[0].edac_mode = EDAC_SECDED; + + return (ndimms == 0); } /* @@ -1242,9 +1265,7 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx) { struct mem_ctl_info *mci; struct i5400_pvt *pvt; - int num_channels; - int num_dimms_per_channel; - int num_csrows; + struct edac_mc_layer layers[3]; if (dev_idx >= ARRAY_SIZE(i5400_devs)) return -EINVAL; @@ -1258,23 +1279,21 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx) if (PCI_FUNC(pdev->devfn) != 0) return -ENODEV; - /* As we don't have a motherboard identification routine to determine - * actual number of slots/dimms per channel, we thus utilize the - * resource as specified by the chipset. Thus, we might have - * have more DIMMs per channel than actually on the mobo, but this - * allows the driver to support up to the chipset max, without - * some fancy mobo determination. + /* + * allocate a new MC control structure + * + * This drivers uses the DIMM slot as "csrow" and the rest as "channel". */ - num_dimms_per_channel = MAX_DIMMS_PER_CHANNEL; - num_channels = MAX_CHANNELS; - num_csrows = num_dimms_per_channel; - - debugf0("MC: %s(): Number of - Channels= %d DIMMS= %d CSROWS= %d\n", - __func__, num_channels, num_dimms_per_channel, num_csrows); - - /* allocate a new MC control structure */ - mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0); - + layers[0].type = EDAC_MC_LAYER_BRANCH; + layers[0].size = MAX_BRANCHES; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = CHANNELS_PER_BRANCH; + layers[1].is_virt_csrow = false; + layers[2].type = EDAC_MC_LAYER_SLOT; + layers[2].size = DIMMS_PER_CHANNEL; + layers[2].is_virt_csrow = true; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (mci == NULL) return -ENOMEM; @@ -1284,8 +1303,8 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx) pvt = mci->pvt_info; pvt->system_address = pdev; /* Record this device in our private */ - pvt->maxch = num_channels; - pvt->maxdimmperch = num_dimms_per_channel; + pvt->maxch = MAX_CHANNELS; + pvt->maxdimmperch = DIMMS_PER_CHANNEL; /* 'get' the pci devices we want to reserve for our use */ if (i5400_get_devices(mci, dev_idx)) @@ -1307,13 +1326,13 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx) /* Set the function pointer to an actual operation function */ mci->edac_check = i5400_check_error; - /* initialize the MC control structure 'csrows' table + /* initialize the MC control structure 'dimms' table * with the mapping and control information */ - if (i5400_init_csrows(mci)) { + if (i5400_init_dimms(mci)) { debugf0("MC: Setting mci->edac_cap to EDAC_FLAG_NONE\n" - " because i5400_init_csrows() returned nonzero " + " because i5400_init_dimms() returned nonzero " "value\n"); - mci->edac_cap = EDAC_FLAG_NONE; /* no csrows found */ + mci->edac_cap = EDAC_FLAG_NONE; /* no dimms found */ } else { debugf1("MC: Enable error reporting now\n"); i5400_enable_error_reporting(mci); diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 3bafa3b..97c22fd 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -464,17 +464,14 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci) FERR_FAT_FBD, error_reg); snprintf(pvt->tmp_prt_buffer, PAGE_SIZE, - "FATAL (Branch=%d DRAM-Bank=%d %s " - "RAS=%d CAS=%d Err=0x%lx (%s))", - branch, bank, - is_wr ? "RDWR" : "RD", - ras, cas, - errors, specific); - - /* Call the helper to output message */ - edac_mc_handle_fbd_ue(mci, rank, branch << 1, - (branch << 1) + 1, - pvt->tmp_prt_buffer); + "Bank=%d RAS=%d CAS=%d Err=0x%lx (%s))", + bank, ras, cas, errors, specific); + + edac_mc_handle_error(HW_EVENT_ERR_FATAL, mci, 0, 0, 0, + branch, -1, rank, + is_wr ? "Write error" : "Read error", + pvt->tmp_prt_buffer, NULL); + } /* read in the 1st NON-FATAL error register */ @@ -513,23 +510,14 @@ static void i7300_process_fbd_error(struct mem_ctl_info *mci) /* Form out message */ snprintf(pvt->tmp_prt_buffer, PAGE_SIZE, - "Corrected error (Branch=%d, Channel %d), " - " DRAM-Bank=%d %s " - "RAS=%d CAS=%d, CE Err=0x%lx, Syndrome=0x%08x(%s))", - branch, channel, - bank, - is_wr ? "RDWR" : "RD", - ras, cas, - errors, syndrome, specific); - - /* - * Call the helper to output message - * NOTE: Errors are reported per-branch, and not per-channel - * Currently, we don't know how to identify the right - * channel. - */ - edac_mc_handle_fbd_ce(mci, rank, channel, - pvt->tmp_prt_buffer); + "DRAM-Bank=%d RAS=%d CAS=%d, Err=0x%lx (%s))", + bank, ras, cas, errors, specific); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, + syndrome, + branch >> 1, channel % 2, rank, + is_wr ? "Write error" : "Read error", + pvt->tmp_prt_buffer, NULL); } return; } @@ -617,8 +605,7 @@ static void i7300_enable_error_reporting(struct mem_ctl_info *mci) static int decode_mtr(struct i7300_pvt *pvt, int slot, int ch, int branch, struct i7300_dimm_info *dinfo, - struct csrow_info *p_csrow, - u32 *nr_pages) + struct dimm_info *dimm) { int mtr, ans, addrBits, channel; @@ -650,7 +637,6 @@ static int decode_mtr(struct i7300_pvt *pvt, addrBits -= 3; /* 8 bits per bytes */ dinfo->megabytes = 1 << addrBits; - *nr_pages = dinfo->megabytes << 8; debugf2("\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr)); @@ -663,11 +649,6 @@ static int decode_mtr(struct i7300_pvt *pvt, debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]); debugf2("\t\tSIZE: %d MB\n", dinfo->megabytes); - p_csrow->grain = 8; - p_csrow->mtype = MEM_FB_DDR2; - p_csrow->csrow_idx = slot; - p_csrow->page_mask = 0; - /* * The type of error detection actually depends of the * mode of operation. When it is just one single memory chip, at @@ -677,15 +658,18 @@ static int decode_mtr(struct i7300_pvt *pvt, * See datasheet Sections 7.3.6 to 7.3.8 */ + dimm->nr_pages = MiB_TO_PAGES(dinfo->megabytes); + dimm->grain = 8; + dimm->mtype = MEM_FB_DDR2; if (IS_SINGLE_MODE(pvt->mc_settings_a)) { - p_csrow->edac_mode = EDAC_SECDED; + dimm->edac_mode = EDAC_SECDED; debugf2("\t\tECC code is 8-byte-over-32-byte SECDED+ code\n"); } else { debugf2("\t\tECC code is on Lockstep mode\n"); if (MTR_DRAM_WIDTH(mtr) == 8) - p_csrow->edac_mode = EDAC_S8ECD8ED; + dimm->edac_mode = EDAC_S8ECD8ED; else - p_csrow->edac_mode = EDAC_S4ECD4ED; + dimm->edac_mode = EDAC_S4ECD4ED; } /* ask what device type on this row */ @@ -694,9 +678,9 @@ static int decode_mtr(struct i7300_pvt *pvt, IS_SCRBALGO_ENHANCED(pvt->mc_settings) ? "enhanced" : "normal"); - p_csrow->dtype = DEV_X8; + dimm->dtype = DEV_X8; } else - p_csrow->dtype = DEV_X4; + dimm->dtype = DEV_X4; return mtr; } @@ -774,11 +758,10 @@ static int i7300_init_csrows(struct mem_ctl_info *mci) { struct i7300_pvt *pvt; struct i7300_dimm_info *dinfo; - struct csrow_info *p_csrow; int rc = -ENODEV; int mtr; int ch, branch, slot, channel; - u32 last_page = 0, nr_pages; + struct dimm_info *dimm; pvt = mci->pvt_info; @@ -809,25 +792,23 @@ static int i7300_init_csrows(struct mem_ctl_info *mci) pci_read_config_word(pvt->pci_dev_2x_0_fbd_branch[branch], where, &pvt->mtr[slot][branch]); - for (ch = 0; ch < MAX_BRANCHES; ch++) { + for (ch = 0; ch < MAX_CH_PER_BRANCH; ch++) { int channel = to_channel(ch, branch); + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, + mci->n_layers, branch, ch, slot); + dinfo = &pvt->dimm_info[slot][channel]; - p_csrow = &mci->csrows[slot]; mtr = decode_mtr(pvt, slot, ch, branch, - dinfo, p_csrow, &nr_pages); + dinfo, dimm); + /* if no DIMMS on this row, continue */ if (!MTR_DIMMS_PRESENT(mtr)) continue; - /* Update per_csrow memory count */ - p_csrow->nr_pages += nr_pages; - p_csrow->first_page = last_page; - last_page += nr_pages; - p_csrow->last_page = last_page; - rc = 0; + } } } @@ -1042,10 +1023,8 @@ static int __devinit i7300_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[3]; struct i7300_pvt *pvt; - int num_channels; - int num_dimms_per_channel; - int num_csrows; int rc; /* wake up device */ @@ -1062,23 +1041,17 @@ static int __devinit i7300_init_one(struct pci_dev *pdev, if (PCI_FUNC(pdev->devfn) != 0) return -ENODEV; - /* As we don't have a motherboard identification routine to determine - * actual number of slots/dimms per channel, we thus utilize the - * resource as specified by the chipset. Thus, we might have - * have more DIMMs per channel than actually on the mobo, but this - * allows the driver to support up to the chipset max, without - * some fancy mobo determination. - */ - num_dimms_per_channel = MAX_SLOTS; - num_channels = MAX_CHANNELS; - num_csrows = MAX_SLOTS * MAX_CHANNELS; - - debugf0("MC: %s(): Number of - Channels= %d DIMMS= %d CSROWS= %d\n", - __func__, num_channels, num_dimms_per_channel, num_csrows); - /* allocate a new MC control structure */ - mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0); - + layers[0].type = EDAC_MC_LAYER_BRANCH; + layers[0].size = MAX_BRANCHES; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = MAX_CH_PER_BRANCH; + layers[1].is_virt_csrow = true; + layers[2].type = EDAC_MC_LAYER_SLOT; + layers[2].size = MAX_SLOTS; + layers[2].is_virt_csrow = true; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (mci == NULL) return -ENOMEM; diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 7f1dfcc..d27778f 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -221,7 +221,9 @@ struct i7core_inject { }; struct i7core_channel { - u32 ranks; + bool is_3dimms_present; + bool is_single_4rank; + bool has_4rank; u32 dimms; }; @@ -257,7 +259,6 @@ struct i7core_pvt { struct i7core_channel channel[NUM_CHANS]; int ce_count_available; - int csrow_map[NUM_CHANS][MAX_DIMMS]; /* ECC corrected errors counts per udimm */ unsigned long udimm_ce_count[MAX_DIMMS]; @@ -492,116 +493,15 @@ static void free_i7core_dev(struct i7core_dev *i7core_dev) /**************************************************************************** Memory check routines ****************************************************************************/ -static struct pci_dev *get_pdev_slot_func(u8 socket, unsigned slot, - unsigned func) -{ - struct i7core_dev *i7core_dev = get_i7core_dev(socket); - int i; - - if (!i7core_dev) - return NULL; - - for (i = 0; i < i7core_dev->n_devs; i++) { - if (!i7core_dev->pdev[i]) - continue; - - if (PCI_SLOT(i7core_dev->pdev[i]->devfn) == slot && - PCI_FUNC(i7core_dev->pdev[i]->devfn) == func) { - return i7core_dev->pdev[i]; - } - } - - return NULL; -} - -/** - * i7core_get_active_channels() - gets the number of channels and csrows - * @socket: Quick Path Interconnect socket - * @channels: Number of channels that will be returned - * @csrows: Number of csrows found - * - * Since EDAC core needs to know in advance the number of available channels - * and csrows, in order to allocate memory for csrows/channels, it is needed - * to run two similar steps. At the first step, implemented on this function, - * it checks the number of csrows/channels present at one socket. - * this is used in order to properly allocate the size of mci components. - * - * It should be noticed that none of the current available datasheets explain - * or even mention how csrows are seen by the memory controller. So, we need - * to add a fake description for csrows. - * So, this driver is attributing one DIMM memory for one csrow. - */ -static int i7core_get_active_channels(const u8 socket, unsigned *channels, - unsigned *csrows) -{ - struct pci_dev *pdev = NULL; - int i, j; - u32 status, control; - - *channels = 0; - *csrows = 0; - - pdev = get_pdev_slot_func(socket, 3, 0); - if (!pdev) { - i7core_printk(KERN_ERR, "Couldn't find socket %d fn 3.0!!!\n", - socket); - return -ENODEV; - } - - /* Device 3 function 0 reads */ - pci_read_config_dword(pdev, MC_STATUS, &status); - pci_read_config_dword(pdev, MC_CONTROL, &control); - - for (i = 0; i < NUM_CHANS; i++) { - u32 dimm_dod[3]; - /* Check if the channel is active */ - if (!(control & (1 << (8 + i)))) - continue; - - /* Check if the channel is disabled */ - if (status & (1 << i)) - continue; - - pdev = get_pdev_slot_func(socket, i + 4, 1); - if (!pdev) { - i7core_printk(KERN_ERR, "Couldn't find socket %d " - "fn %d.%d!!!\n", - socket, i + 4, 1); - return -ENODEV; - } - /* Devices 4-6 function 1 */ - pci_read_config_dword(pdev, - MC_DOD_CH_DIMM0, &dimm_dod[0]); - pci_read_config_dword(pdev, - MC_DOD_CH_DIMM1, &dimm_dod[1]); - pci_read_config_dword(pdev, - MC_DOD_CH_DIMM2, &dimm_dod[2]); - (*channels)++; - - for (j = 0; j < 3; j++) { - if (!DIMM_PRESENT(dimm_dod[j])) - continue; - (*csrows)++; - } - } - - debugf0("Number of active channels on socket %d: %d\n", - socket, *channels); - - return 0; -} - -static int get_dimm_config(const struct mem_ctl_info *mci) +static int get_dimm_config(struct mem_ctl_info *mci) { struct i7core_pvt *pvt = mci->pvt_info; - struct csrow_info *csr; struct pci_dev *pdev; int i, j; - int csrow = 0; - unsigned long last_page = 0; enum edac_type mode; enum mem_type mtype; + struct dimm_info *dimm; /* Get data from the MC register, function 0 */ pdev = pvt->pci_mcr[0]; @@ -657,21 +557,20 @@ static int get_dimm_config(const struct mem_ctl_info *mci) pci_read_config_dword(pvt->pci_ch[i][0], MC_CHANNEL_DIMM_INIT_PARAMS, &data); - pvt->channel[i].ranks = (data & QUAD_RANK_PRESENT) ? - 4 : 2; + + if (data & THREE_DIMMS_PRESENT) + pvt->channel[i].is_3dimms_present = true; + + if (data & SINGLE_QUAD_RANK_PRESENT) + pvt->channel[i].is_single_4rank = true; + + if (data & QUAD_RANK_PRESENT) + pvt->channel[i].has_4rank = true; if (data & REGISTERED_DIMM) mtype = MEM_RDDR3; else mtype = MEM_DDR3; -#if 0 - if (data & THREE_DIMMS_PRESENT) - pvt->channel[i].dimms = 3; - else if (data & SINGLE_QUAD_RANK_PRESENT) - pvt->channel[i].dimms = 1; - else - pvt->channel[i].dimms = 2; -#endif /* Devices 4-6 function 1 */ pci_read_config_dword(pvt->pci_ch[i][1], @@ -682,11 +581,13 @@ static int get_dimm_config(const struct mem_ctl_info *mci) MC_DOD_CH_DIMM2, &dimm_dod[2]); debugf0("Ch%d phy rd%d, wr%d (0x%08x): " - "%d ranks, %cDIMMs\n", + "%s%s%s%cDIMMs\n", i, RDLCH(pvt->info.ch_map, i), WRLCH(pvt->info.ch_map, i), data, - pvt->channel[i].ranks, + pvt->channel[i].is_3dimms_present ? "3DIMMS " : "", + pvt->channel[i].is_3dimms_present ? "SINGLE_4R " : "", + pvt->channel[i].has_4rank ? "HAS_4R " : "", (data & REGISTERED_DIMM) ? 'R' : 'U'); for (j = 0; j < 3; j++) { @@ -696,6 +597,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci) if (!DIMM_PRESENT(dimm_dod[j])) continue; + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, + i, j, 0); banks = numbank(MC_DOD_NUMBANK(dimm_dod[j])); ranks = numrank(MC_DOD_NUMRANK(dimm_dod[j])); rows = numrow(MC_DOD_NUMROW(dimm_dod[j])); @@ -704,8 +607,6 @@ static int get_dimm_config(const struct mem_ctl_info *mci) /* DDR3 has 8 I/O banks */ size = (rows * cols * banks * ranks) >> (20 - 3); - pvt->channel[i].dimms++; - debugf0("\tdimm %d %d Mb offset: %x, " "bank: %d, rank: %d, row: %#x, col: %#x\n", j, size, @@ -714,44 +615,28 @@ static int get_dimm_config(const struct mem_ctl_info *mci) npages = MiB_TO_PAGES(size); - csr = &mci->csrows[csrow]; - csr->first_page = last_page + 1; - last_page += npages; - csr->last_page = last_page; - csr->nr_pages = npages; - - csr->page_mask = 0; - csr->grain = 8; - csr->csrow_idx = csrow; - csr->nr_channels = 1; - - csr->channels[0].chan_idx = i; - csr->channels[0].ce_count = 0; - - pvt->csrow_map[i][j] = csrow; + dimm->nr_pages = npages; switch (banks) { case 4: - csr->dtype = DEV_X4; + dimm->dtype = DEV_X4; break; case 8: - csr->dtype = DEV_X8; + dimm->dtype = DEV_X8; break; case 16: - csr->dtype = DEV_X16; + dimm->dtype = DEV_X16; break; default: - csr->dtype = DEV_UNKNOWN; + dimm->dtype = DEV_UNKNOWN; } - csr->edac_mode = mode; - csr->mtype = mtype; - snprintf(csr->channels[0].label, - sizeof(csr->channels[0].label), - "CPU#%uChannel#%u_DIMM#%u", - pvt->i7core_dev->socket, i, j); - - csrow++; + snprintf(dimm->label, sizeof(dimm->label), + "CPU#%uChannel#%u_DIMM#%u", + pvt->i7core_dev->socket, i, j); + dimm->grain = 8; + dimm->edac_mode = mode; + dimm->mtype = mtype; } pci_read_config_dword(pdev, MC_SAG_CH_0, &value[0]); @@ -1567,22 +1452,16 @@ error: /**************************************************************************** Error check routines ****************************************************************************/ -static void i7core_rdimm_update_csrow(struct mem_ctl_info *mci, +static void i7core_rdimm_update_errcount(struct mem_ctl_info *mci, const int chan, const int dimm, const int add) { - char *msg; - struct i7core_pvt *pvt = mci->pvt_info; - int row = pvt->csrow_map[chan][dimm], i; + int i; for (i = 0; i < add; i++) { - msg = kasprintf(GFP_KERNEL, "Corrected error " - "(Socket=%d channel=%d dimm=%d)", - pvt->i7core_dev->socket, chan, dimm); - - edac_mc_handle_fbd_ce(mci, row, 0, msg); - kfree (msg); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 0, 0, 0, + chan, dimm, -1, "error", "", NULL); } } @@ -1623,11 +1502,11 @@ static void i7core_rdimm_update_ce_count(struct mem_ctl_info *mci, /*updated the edac core */ if (add0 != 0) - i7core_rdimm_update_csrow(mci, chan, 0, add0); + i7core_rdimm_update_errcount(mci, chan, 0, add0); if (add1 != 0) - i7core_rdimm_update_csrow(mci, chan, 1, add1); + i7core_rdimm_update_errcount(mci, chan, 1, add1); if (add2 != 0) - i7core_rdimm_update_csrow(mci, chan, 2, add2); + i7core_rdimm_update_errcount(mci, chan, 2, add2); } @@ -1747,20 +1626,30 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci, const struct mce *m) { struct i7core_pvt *pvt = mci->pvt_info; - char *type, *optype, *err, *msg; + char *type, *optype, *err, msg[80]; + enum hw_event_mc_err_type tp_event; unsigned long error = m->status & 0x1ff0000l; + bool uncorrected_error = m->mcgstatus & 1ll << 61; + bool ripv = m->mcgstatus & 1; u32 optypenum = (m->status >> 4) & 0x07; u32 core_err_cnt = (m->status >> 38) & 0x7fff; u32 dimm = (m->misc >> 16) & 0x3; u32 channel = (m->misc >> 18) & 0x3; u32 syndrome = m->misc >> 32; u32 errnum = find_first_bit(&error, 32); - int csrow; - if (m->mcgstatus & 1) - type = "FATAL"; - else - type = "NON_FATAL"; + if (uncorrected_error) { + if (ripv) { + type = "FATAL"; + tp_event = HW_EVENT_ERR_FATAL; + } else { + type = "NON_FATAL"; + tp_event = HW_EVENT_ERR_UNCORRECTED; + } + } else { + type = "CORRECTED"; + tp_event = HW_EVENT_ERR_CORRECTED; + } switch (optypenum) { case 0: @@ -1815,27 +1704,20 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci, err = "unknown"; } - /* FIXME: should convert addr into bank and rank information */ - msg = kasprintf(GFP_ATOMIC, - "%s (addr = 0x%08llx, cpu=%d, Dimm=%d, Channel=%d, " - "syndrome=0x%08x, count=%d, Err=%08llx:%08llx (%s: %s))\n", - type, (long long) m->addr, m->cpu, dimm, channel, - syndrome, core_err_cnt, (long long)m->status, - (long long)m->misc, optype, err); - - debugf0("%s", msg); - - csrow = pvt->csrow_map[channel][dimm]; + snprintf(msg, sizeof(msg), "count=%d %s", core_err_cnt, optype); - /* Call the helper to output message */ - if (m->mcgstatus & 1) - edac_mc_handle_fbd_ue(mci, csrow, 0, - 0 /* FIXME: should be channel here */, msg); - else if (!pvt->is_registered) - edac_mc_handle_fbd_ce(mci, csrow, - 0 /* FIXME: should be channel here */, msg); - - kfree(msg); + /* + * Call the helper to output message + * FIXME: what to do if core_err_cnt > 1? Currently, it generates + * only one event + */ + if (uncorrected_error || !pvt->is_registered) + edac_mc_handle_error(tp_event, mci, + m->addr >> PAGE_SHIFT, + m->addr & ~PAGE_MASK, + syndrome, + channel, dimm, -1, + err, msg, m); } /* @@ -2252,15 +2134,19 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) { struct mem_ctl_info *mci; struct i7core_pvt *pvt; - int rc, channels, csrows; - - /* Check the number of active and not disabled channels */ - rc = i7core_get_active_channels(i7core_dev->socket, &channels, &csrows); - if (unlikely(rc < 0)) - return rc; + int rc; + struct edac_mc_layer layers[2]; /* allocate a new MC control structure */ - mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, i7core_dev->socket); + + layers[0].type = EDAC_MC_LAYER_CHANNEL; + layers[0].size = NUM_CHANS; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_SLOT; + layers[1].size = MAX_DIMMS; + layers[1].is_virt_csrow = true; + mci = edac_mc_alloc(i7core_dev->socket, ARRAY_SIZE(layers), layers, + sizeof(*pvt)); if (unlikely(!mci)) return -ENOMEM; diff --git a/drivers/edac/i82443bxgx_edac.c b/drivers/edac/i82443bxgx_edac.c index 3bf2b2f..52072c2 100644 --- a/drivers/edac/i82443bxgx_edac.c +++ b/drivers/edac/i82443bxgx_edac.c @@ -12,7 +12,7 @@ * 440GX fix by Jason Uhlenkott <juhlenko@akamai.com>. * * Written with reference to 82443BX Host Bridge Datasheet: - * http://download.intel.com/design/chipsets/datashts/29063301.pdf + * http://download.intel.com/design/chipsets/datashts/29063301.pdf * references to this document given in []. * * This module doesn't support the 440LX, but it may be possible to @@ -156,19 +156,19 @@ static int i82443bxgx_edacmc_process_error_info(struct mem_ctl_info *mci, if (info->eap & I82443BXGX_EAP_OFFSET_SBE) { error_found = 1; if (handle_errors) - edac_mc_handle_ce(mci, page, pageoffset, - /* 440BX/GX don't make syndrome information - * available */ - 0, edac_mc_find_csrow_by_page(mci, page), 0, - mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, pageoffset, 0, + edac_mc_find_csrow_by_page(mci, page), + 0, -1, mci->ctl_name, "", NULL); } if (info->eap & I82443BXGX_EAP_OFFSET_MBE) { error_found = 1; if (handle_errors) - edac_mc_handle_ue(mci, page, pageoffset, - edac_mc_find_csrow_by_page(mci, page), - mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, pageoffset, 0, + edac_mc_find_csrow_by_page(mci, page), + 0, -1, mci->ctl_name, "", NULL); } return error_found; @@ -189,6 +189,7 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci, enum mem_type mtype) { struct csrow_info *csrow; + struct dimm_info *dimm; int index; u8 drbar, dramc; u32 row_base, row_high_limit, row_high_limit_last; @@ -197,6 +198,8 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci, row_high_limit_last = 0; for (index = 0; index < mci->nr_csrows; index++) { csrow = &mci->csrows[index]; + dimm = csrow->channels[0].dimm; + pci_read_config_byte(pdev, I82443BXGX_DRB + index, &drbar); debugf1("MC%d: %s: %s() Row=%d DRB = %#0x\n", mci->mc_idx, __FILE__, __func__, index, drbar); @@ -217,14 +220,14 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci, row_base = row_high_limit_last; csrow->first_page = row_base >> PAGE_SHIFT; csrow->last_page = (row_high_limit >> PAGE_SHIFT) - 1; - csrow->nr_pages = csrow->last_page - csrow->first_page + 1; + dimm->nr_pages = csrow->last_page - csrow->first_page + 1; /* EAP reports in 4kilobyte granularity [61] */ - csrow->grain = 1 << 12; - csrow->mtype = mtype; + dimm->grain = 1 << 12; + dimm->mtype = mtype; /* I don't think 440BX can tell you device type? FIXME? */ - csrow->dtype = DEV_UNKNOWN; + dimm->dtype = DEV_UNKNOWN; /* Mode is global to all rows on 440BX */ - csrow->edac_mode = edac_mode; + dimm->edac_mode = edac_mode; row_high_limit_last = row_high_limit; } } @@ -232,6 +235,7 @@ static void i82443bxgx_init_csrows(struct mem_ctl_info *mci, static int i82443bxgx_edacmc_probe1(struct pci_dev *pdev, int dev_idx) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; u8 dramc; u32 nbxcfg, ecc_mode; enum mem_type mtype; @@ -245,8 +249,13 @@ static int i82443bxgx_edacmc_probe1(struct pci_dev *pdev, int dev_idx) if (pci_read_config_dword(pdev, I82443BXGX_NBXCFG, &nbxcfg)) return -EIO; - mci = edac_mc_alloc(0, I82443BXGX_NR_CSROWS, I82443BXGX_NR_CHANS, 0); - + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = I82443BXGX_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = I82443BXGX_NR_CHANS; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0); if (mci == NULL) return -ENOMEM; diff --git a/drivers/edac/i82860_edac.c b/drivers/edac/i82860_edac.c index c779092..0804505 100644 --- a/drivers/edac/i82860_edac.c +++ b/drivers/edac/i82860_edac.c @@ -99,6 +99,7 @@ static int i82860_process_error_info(struct mem_ctl_info *mci, struct i82860_error_info *info, int handle_errors) { + struct dimm_info *dimm; int row; if (!(info->errsts2 & 0x0003)) @@ -108,18 +109,25 @@ static int i82860_process_error_info(struct mem_ctl_info *mci, return 1; if ((info->errsts ^ info->errsts2) & 0x0003) { - edac_mc_handle_ce_no_info(mci, "UE overwrote CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, "UE overwrote CE", "", NULL); info->errsts = info->errsts2; } info->eap >>= PAGE_SHIFT; row = edac_mc_find_csrow_by_page(mci, info->eap); + dimm = mci->csrows[row].channels[0].dimm; if (info->errsts & 0x0002) - edac_mc_handle_ue(mci, info->eap, 0, row, "i82860 UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + info->eap, 0, 0, + dimm->location[0], dimm->location[1], -1, + "i82860 UE", "", NULL); else - edac_mc_handle_ce(mci, info->eap, 0, info->derrsyn, row, 0, - "i82860 UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + info->eap, 0, info->derrsyn, + dimm->location[0], dimm->location[1], -1, + "i82860 CE", "", NULL); return 1; } @@ -140,6 +148,7 @@ static void i82860_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev) u16 value; u32 cumul_size; struct csrow_info *csrow; + struct dimm_info *dimm; int index; pci_read_config_word(pdev, I82860_MCHCFG, &mchcfg_ddim); @@ -153,6 +162,8 @@ static void i82860_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev) */ for (index = 0; index < mci->nr_csrows; index++) { csrow = &mci->csrows[index]; + dimm = csrow->channels[0].dimm; + pci_read_config_word(pdev, I82860_GBA + index * 2, &value); cumul_size = (value & I82860_GBA_MASK) << (I82860_GBA_SHIFT - PAGE_SHIFT); @@ -164,30 +175,38 @@ static void i82860_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev) csrow->first_page = last_cumul_size; csrow->last_page = cumul_size - 1; - csrow->nr_pages = cumul_size - last_cumul_size; + dimm->nr_pages = cumul_size - last_cumul_size; last_cumul_size = cumul_size; - csrow->grain = 1 << 12; /* I82860_EAP has 4KiB reolution */ - csrow->mtype = MEM_RMBS; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = mchcfg_ddim ? EDAC_SECDED : EDAC_NONE; + dimm->grain = 1 << 12; /* I82860_EAP has 4KiB reolution */ + dimm->mtype = MEM_RMBS; + dimm->dtype = DEV_UNKNOWN; + dimm->edac_mode = mchcfg_ddim ? EDAC_SECDED : EDAC_NONE; } } static int i82860_probe1(struct pci_dev *pdev, int dev_idx) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct i82860_error_info discard; - /* RDRAM has channels but these don't map onto the abstractions that - edac uses. - The device groups from the GRA registers seem to map reasonably - well onto the notion of a chip select row. - There are 16 GRA registers and since the name is associated with - the channel and the GRA registers map to physical devices so we are - going to make 1 channel for group. + /* + * RDRAM has channels but these don't map onto the csrow abstraction. + * According with the datasheet, there are 2 Rambus channels, supporting + * up to 16 direct RDRAM devices. + * The device groups from the GRA registers seem to map reasonably + * well onto the notion of a chip select row. + * There are 16 GRA registers and since the name is associated with + * the channel and the GRA registers map to physical devices so we are + * going to make 1 channel for group. */ - mci = edac_mc_alloc(0, 16, 1, 0); - + layers[0].type = EDAC_MC_LAYER_CHANNEL; + layers[0].size = 2; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_SLOT; + layers[1].size = 8; + layers[1].is_virt_csrow = true; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0); if (!mci) return -ENOMEM; diff --git a/drivers/edac/i82875p_edac.c b/drivers/edac/i82875p_edac.c index 10f15d8..b613e31 100644 --- a/drivers/edac/i82875p_edac.c +++ b/drivers/edac/i82875p_edac.c @@ -38,7 +38,8 @@ #endif /* PCI_DEVICE_ID_INTEL_82875_6 */ /* four csrows in dual channel, eight in single channel */ -#define I82875P_NR_CSROWS(nr_chans) (8/(nr_chans)) +#define I82875P_NR_DIMMS 8 +#define I82875P_NR_CSROWS(nr_chans) (I82875P_NR_DIMMS / (nr_chans)) /* Intel 82875p register addresses - device 0 function 0 - DRAM Controller */ #define I82875P_EAP 0x58 /* Error Address Pointer (32b) @@ -235,7 +236,9 @@ static int i82875p_process_error_info(struct mem_ctl_info *mci, return 1; if ((info->errsts ^ info->errsts2) & 0x0081) { - edac_mc_handle_ce_no_info(mci, "UE overwrote CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, + "UE overwrote CE", "", NULL); info->errsts = info->errsts2; } @@ -243,11 +246,15 @@ static int i82875p_process_error_info(struct mem_ctl_info *mci, row = edac_mc_find_csrow_by_page(mci, info->eap); if (info->errsts & 0x0080) - edac_mc_handle_ue(mci, info->eap, 0, row, "i82875p UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + info->eap, 0, 0, + row, -1, -1, + "i82875p UE", "", NULL); else - edac_mc_handle_ce(mci, info->eap, 0, info->derrsyn, row, - multi_chan ? (info->des & 0x1) : 0, - "i82875p CE"); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + info->eap, 0, info->derrsyn, + row, multi_chan ? (info->des & 0x1) : 0, + -1, "i82875p CE", "", NULL); return 1; } @@ -342,11 +349,13 @@ static void i82875p_init_csrows(struct mem_ctl_info *mci, void __iomem * ovrfl_window, u32 drc) { struct csrow_info *csrow; + struct dimm_info *dimm; + unsigned nr_chans = dual_channel_active(drc) + 1; unsigned long last_cumul_size; u8 value; u32 drc_ddim; /* DRAM Data Integrity Mode 0=none,2=edac */ - u32 cumul_size; - int index; + u32 cumul_size, nr_pages; + int index, j; drc_ddim = (drc >> 18) & 0x1; last_cumul_size = 0; @@ -369,12 +378,18 @@ static void i82875p_init_csrows(struct mem_ctl_info *mci, csrow->first_page = last_cumul_size; csrow->last_page = cumul_size - 1; - csrow->nr_pages = cumul_size - last_cumul_size; + nr_pages = cumul_size - last_cumul_size; last_cumul_size = cumul_size; - csrow->grain = 1 << 12; /* I82875P_EAP has 4KiB reolution */ - csrow->mtype = MEM_DDR; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = drc_ddim ? EDAC_SECDED : EDAC_NONE; + + for (j = 0; j < nr_chans; j++) { + dimm = csrow->channels[j].dimm; + + dimm->nr_pages = nr_pages / nr_chans; + dimm->grain = 1 << 12; /* I82875P_EAP has 4KiB reolution */ + dimm->mtype = MEM_DDR; + dimm->dtype = DEV_UNKNOWN; + dimm->edac_mode = drc_ddim ? EDAC_SECDED : EDAC_NONE; + } } } @@ -382,6 +397,7 @@ static int i82875p_probe1(struct pci_dev *pdev, int dev_idx) { int rc = -ENODEV; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct i82875p_pvt *pvt; struct pci_dev *ovrfl_pdev; void __iomem *ovrfl_window; @@ -397,9 +413,14 @@ static int i82875p_probe1(struct pci_dev *pdev, int dev_idx) return -ENODEV; drc = readl(ovrfl_window + I82875P_DRC); nr_chans = dual_channel_active(drc) + 1; - mci = edac_mc_alloc(sizeof(*pvt), I82875P_NR_CSROWS(nr_chans), - nr_chans, 0); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = I82875P_NR_CSROWS(nr_chans); + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = nr_chans; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (!mci) { rc = -ENOMEM; goto fail0; diff --git a/drivers/edac/i82975x_edac.c b/drivers/edac/i82975x_edac.c index 0cd8368..433332c 100644 --- a/drivers/edac/i82975x_edac.c +++ b/drivers/edac/i82975x_edac.c @@ -29,7 +29,8 @@ #define PCI_DEVICE_ID_INTEL_82975_0 0x277c #endif /* PCI_DEVICE_ID_INTEL_82975_0 */ -#define I82975X_NR_CSROWS(nr_chans) (8/(nr_chans)) +#define I82975X_NR_DIMMS 8 +#define I82975X_NR_CSROWS(nr_chans) (I82975X_NR_DIMMS / (nr_chans)) /* Intel 82975X register addresses - device 0 function 0 - DRAM Controller */ #define I82975X_EAP 0x58 /* Dram Error Address Pointer (32b) @@ -287,7 +288,8 @@ static int i82975x_process_error_info(struct mem_ctl_info *mci, return 1; if ((info->errsts ^ info->errsts2) & 0x0003) { - edac_mc_handle_ce_no_info(mci, "UE overwrote CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, "UE overwrote CE", "", NULL); info->errsts = info->errsts2; } @@ -309,13 +311,18 @@ static int i82975x_process_error_info(struct mem_ctl_info *mci, chan = (mci->csrows[row].nr_channels == 1) ? 0 : info->eap & 1; offst = info->eap & ((1 << PAGE_SHIFT) - - (1 << mci->csrows[row].grain)); + (1 << mci->csrows[row].channels[chan].dimm->grain)); if (info->errsts & 0x0002) - edac_mc_handle_ue(mci, page, offst , row, "i82975x UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, offst, 0, + row, -1, -1, + "i82975x UE", "", NULL); else - edac_mc_handle_ce(mci, page, offst, info->derrsyn, row, - chan, "i82975x CE"); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, offst, info->derrsyn, + row, chan ? chan : 0, -1, + "i82975x CE", "", NULL); return 1; } @@ -370,8 +377,10 @@ static void i82975x_init_csrows(struct mem_ctl_info *mci, struct csrow_info *csrow; unsigned long last_cumul_size; u8 value; - u32 cumul_size; + u32 cumul_size, nr_pages; int index, chan; + struct dimm_info *dimm; + enum dev_type dtype; last_cumul_size = 0; @@ -400,28 +409,33 @@ static void i82975x_init_csrows(struct mem_ctl_info *mci, debugf3("%s(): (%d) cumul_size 0x%x\n", __func__, index, cumul_size); + nr_pages = cumul_size - last_cumul_size; + if (!nr_pages) + continue; + /* * Initialise dram labels * index values: * [0-7] for single-channel; i.e. csrow->nr_channels = 1 * [0-3] for dual-channel; i.e. csrow->nr_channels = 2 */ - for (chan = 0; chan < csrow->nr_channels; chan++) - strncpy(csrow->channels[chan].label, + dtype = i82975x_dram_type(mch_window, index); + for (chan = 0; chan < csrow->nr_channels; chan++) { + dimm = mci->csrows[index].channels[chan].dimm; + + dimm->nr_pages = nr_pages / csrow->nr_channels; + strncpy(csrow->channels[chan].dimm->label, labels[(index >> 1) + (chan * 2)], EDAC_MC_LABEL_LEN); - - if (cumul_size == last_cumul_size) - continue; /* not populated */ + dimm->grain = 1 << 7; /* 128Byte cache-line resolution */ + dimm->dtype = i82975x_dram_type(mch_window, index); + dimm->mtype = MEM_DDR2; /* I82975x supports only DDR2 */ + dimm->edac_mode = EDAC_SECDED; /* only supported */ + } csrow->first_page = last_cumul_size; csrow->last_page = cumul_size - 1; - csrow->nr_pages = cumul_size - last_cumul_size; last_cumul_size = cumul_size; - csrow->grain = 1 << 7; /* 128Byte cache-line resolution */ - csrow->mtype = MEM_DDR2; /* I82975x supports only DDR2 */ - csrow->dtype = i82975x_dram_type(mch_window, index); - csrow->edac_mode = EDAC_SECDED; /* only supported */ } } @@ -463,6 +477,7 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx) { int rc = -ENODEV; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct i82975x_pvt *pvt; void __iomem *mch_window; u32 mchbar; @@ -531,8 +546,13 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx) chans = dual_channel_active(mch_window) + 1; /* assuming only one controller, index thus is 0 */ - mci = edac_mc_alloc(sizeof(*pvt), I82975X_NR_CSROWS(chans), - chans, 0); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = I82975X_NR_DIMMS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = I82975X_NR_CSROWS(chans); + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (!mci) { rc = -ENOMEM; goto fail1; diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 73464a6..4c40235 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -854,12 +854,16 @@ static void mpc85xx_mc_check(struct mem_ctl_info *mci) mpc85xx_mc_printk(mci, KERN_ERR, "PFN out of range!\n"); if (err_detect & DDR_EDE_SBE) - edac_mc_handle_ce(mci, pfn, err_addr & ~PAGE_MASK, - syndrome, row_index, 0, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + pfn, err_addr & ~PAGE_MASK, syndrome, + row_index, 0, -1, + mci->ctl_name, "", NULL); if (err_detect & DDR_EDE_MBE) - edac_mc_handle_ue(mci, pfn, err_addr & ~PAGE_MASK, - row_index, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + pfn, err_addr & ~PAGE_MASK, syndrome, + row_index, 0, -1, + mci->ctl_name, "", NULL); out_be32(pdata->mc_vbase + MPC85XX_MC_ERR_DETECT, err_detect); } @@ -883,6 +887,7 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci) { struct mpc85xx_mc_pdata *pdata = mci->pvt_info; struct csrow_info *csrow; + struct dimm_info *dimm; u32 sdram_ctl; u32 sdtype; enum mem_type mtype; @@ -929,6 +934,8 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci) u32 end; csrow = &mci->csrows[index]; + dimm = csrow->channels[0].dimm; + cs_bnds = in_be32(pdata->mc_vbase + MPC85XX_MC_CS_BNDS_0 + (index * MPC85XX_MC_CS_BNDS_OFS)); @@ -944,19 +951,21 @@ static void __devinit mpc85xx_init_csrows(struct mem_ctl_info *mci) csrow->first_page = start; csrow->last_page = end; - csrow->nr_pages = end + 1 - start; - csrow->grain = 8; - csrow->mtype = mtype; - csrow->dtype = DEV_UNKNOWN; + + dimm->nr_pages = end + 1 - start; + dimm->grain = 8; + dimm->mtype = mtype; + dimm->dtype = DEV_UNKNOWN; if (sdram_ctl & DSC_X32_EN) - csrow->dtype = DEV_X32; - csrow->edac_mode = EDAC_SECDED; + dimm->dtype = DEV_X32; + dimm->edac_mode = EDAC_SECDED; } } static int __devinit mpc85xx_mc_err_probe(struct platform_device *op) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct mpc85xx_mc_pdata *pdata; struct resource r; u32 sdram_ctl; @@ -965,7 +974,13 @@ static int __devinit mpc85xx_mc_err_probe(struct platform_device *op) if (!devres_open_group(&op->dev, mpc85xx_mc_err_probe, GFP_KERNEL)) return -ENOMEM; - mci = edac_mc_alloc(sizeof(*pdata), 4, 1, edac_mc_idx); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = 4; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = 1; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), sizeof(*pdata)); if (!mci) { devres_release_group(&op->dev, mpc85xx_mc_err_probe); return -ENOMEM; diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c index 7e5ff36..b0bb5a3 100644 --- a/drivers/edac/mv64x60_edac.c +++ b/drivers/edac/mv64x60_edac.c @@ -611,12 +611,17 @@ static void mv64x60_mc_check(struct mem_ctl_info *mci) /* first bit clear in ECC Err Reg, 1 bit error, correctable by HW */ if (!(reg & 0x1)) - edac_mc_handle_ce(mci, err_addr >> PAGE_SHIFT, - err_addr & PAGE_MASK, syndrome, 0, 0, - mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + err_addr >> PAGE_SHIFT, + err_addr & PAGE_MASK, syndrome, + 0, 0, -1, + mci->ctl_name, "", NULL); else /* 2 bit error, UE */ - edac_mc_handle_ue(mci, err_addr >> PAGE_SHIFT, - err_addr & PAGE_MASK, 0, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + err_addr >> PAGE_SHIFT, + err_addr & PAGE_MASK, 0, + 0, 0, -1, + mci->ctl_name, "", NULL); /* clear the error */ out_le32(pdata->mc_vbase + MV64X60_SDRAM_ERR_ADDR, 0); @@ -656,6 +661,8 @@ static void mv64x60_init_csrows(struct mem_ctl_info *mci, struct mv64x60_mc_pdata *pdata) { struct csrow_info *csrow; + struct dimm_info *dimm; + u32 devtype; u32 ctl; @@ -664,35 +671,36 @@ static void mv64x60_init_csrows(struct mem_ctl_info *mci, ctl = in_le32(pdata->mc_vbase + MV64X60_SDRAM_CONFIG); csrow = &mci->csrows[0]; - csrow->first_page = 0; - csrow->nr_pages = pdata->total_mem >> PAGE_SHIFT; - csrow->last_page = csrow->first_page + csrow->nr_pages - 1; - csrow->grain = 8; + dimm = csrow->channels[0].dimm; + + dimm->nr_pages = pdata->total_mem >> PAGE_SHIFT; + dimm->grain = 8; - csrow->mtype = (ctl & MV64X60_SDRAM_REGISTERED) ? MEM_RDDR : MEM_DDR; + dimm->mtype = (ctl & MV64X60_SDRAM_REGISTERED) ? MEM_RDDR : MEM_DDR; devtype = (ctl >> 20) & 0x3; switch (devtype) { case 0x0: - csrow->dtype = DEV_X32; + dimm->dtype = DEV_X32; break; case 0x2: /* could be X8 too, but no way to tell */ - csrow->dtype = DEV_X16; + dimm->dtype = DEV_X16; break; case 0x3: - csrow->dtype = DEV_X4; + dimm->dtype = DEV_X4; break; default: - csrow->dtype = DEV_UNKNOWN; + dimm->dtype = DEV_UNKNOWN; break; } - csrow->edac_mode = EDAC_SECDED; + dimm->edac_mode = EDAC_SECDED; } static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct mv64x60_mc_pdata *pdata; struct resource *r; u32 ctl; @@ -701,7 +709,14 @@ static int __devinit mv64x60_mc_err_probe(struct platform_device *pdev) if (!devres_open_group(&pdev->dev, mv64x60_mc_err_probe, GFP_KERNEL)) return -ENOMEM; - mci = edac_mc_alloc(sizeof(struct mv64x60_mc_pdata), 1, 1, edac_mc_idx); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = 1; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = 1; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(edac_mc_idx, ARRAY_SIZE(layers), layers, + sizeof(struct mv64x60_mc_pdata)); if (!mci) { printk(KERN_ERR "%s: No memory for CPU err\n", __func__); devres_release_group(&pdev->dev, mv64x60_mc_err_probe); diff --git a/drivers/edac/pasemi_edac.c b/drivers/edac/pasemi_edac.c index 7f71ee4..b095a90 100644 --- a/drivers/edac/pasemi_edac.c +++ b/drivers/edac/pasemi_edac.c @@ -110,15 +110,16 @@ static void pasemi_edac_process_error_info(struct mem_ctl_info *mci, u32 errsta) /* uncorrectable/multi-bit errors */ if (errsta & (MCDEBUG_ERRSTA_MBE_STATUS | MCDEBUG_ERRSTA_RFL_STATUS)) { - edac_mc_handle_ue(mci, mci->csrows[cs].first_page, 0, - cs, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + mci->csrows[cs].first_page, 0, 0, + cs, 0, -1, mci->ctl_name, "", NULL); } /* correctable/single-bit errors */ - if (errsta & MCDEBUG_ERRSTA_SBE_STATUS) { - edac_mc_handle_ce(mci, mci->csrows[cs].first_page, 0, - 0, cs, 0, mci->ctl_name); - } + if (errsta & MCDEBUG_ERRSTA_SBE_STATUS) + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + mci->csrows[cs].first_page, 0, 0, + cs, 0, -1, mci->ctl_name, "", NULL); } static void pasemi_edac_check(struct mem_ctl_info *mci) @@ -135,11 +136,13 @@ static int pasemi_edac_init_csrows(struct mem_ctl_info *mci, enum edac_type edac_mode) { struct csrow_info *csrow; + struct dimm_info *dimm; u32 rankcfg; int index; for (index = 0; index < mci->nr_csrows; index++) { csrow = &mci->csrows[index]; + dimm = csrow->channels[0].dimm; pci_read_config_dword(pdev, MCDRAM_RANKCFG + (index * 12), @@ -151,20 +154,20 @@ static int pasemi_edac_init_csrows(struct mem_ctl_info *mci, switch ((rankcfg & MCDRAM_RANKCFG_TYPE_SIZE_M) >> MCDRAM_RANKCFG_TYPE_SIZE_S) { case 0: - csrow->nr_pages = 128 << (20 - PAGE_SHIFT); + dimm->nr_pages = 128 << (20 - PAGE_SHIFT); break; case 1: - csrow->nr_pages = 256 << (20 - PAGE_SHIFT); + dimm->nr_pages = 256 << (20 - PAGE_SHIFT); break; case 2: case 3: - csrow->nr_pages = 512 << (20 - PAGE_SHIFT); + dimm->nr_pages = 512 << (20 - PAGE_SHIFT); break; case 4: - csrow->nr_pages = 1024 << (20 - PAGE_SHIFT); + dimm->nr_pages = 1024 << (20 - PAGE_SHIFT); break; case 5: - csrow->nr_pages = 2048 << (20 - PAGE_SHIFT); + dimm->nr_pages = 2048 << (20 - PAGE_SHIFT); break; default: edac_mc_printk(mci, KERN_ERR, @@ -174,13 +177,13 @@ static int pasemi_edac_init_csrows(struct mem_ctl_info *mci, } csrow->first_page = last_page_in_mmc; - csrow->last_page = csrow->first_page + csrow->nr_pages - 1; - last_page_in_mmc += csrow->nr_pages; + csrow->last_page = csrow->first_page + dimm->nr_pages - 1; + last_page_in_mmc += dimm->nr_pages; csrow->page_mask = 0; - csrow->grain = PASEMI_EDAC_ERROR_GRAIN; - csrow->mtype = MEM_DDR; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = edac_mode; + dimm->grain = PASEMI_EDAC_ERROR_GRAIN; + dimm->mtype = MEM_DDR; + dimm->dtype = DEV_UNKNOWN; + dimm->edac_mode = edac_mode; } return 0; } @@ -189,6 +192,7 @@ static int __devinit pasemi_edac_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct mem_ctl_info *mci = NULL; + struct edac_mc_layer layers[2]; u32 errctl1, errcor, scrub, mcen; pci_read_config_dword(pdev, MCCFG_MCEN, &mcen); @@ -205,9 +209,14 @@ static int __devinit pasemi_edac_probe(struct pci_dev *pdev, MCDEBUG_ERRCTL1_RFL_LOG_EN; pci_write_config_dword(pdev, MCDEBUG_ERRCTL1, errctl1); - mci = edac_mc_alloc(0, PASEMI_EDAC_NR_CSROWS, PASEMI_EDAC_NR_CHANS, - system_mmc_id++); - + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = PASEMI_EDAC_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = PASEMI_EDAC_NR_CHANS; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(system_mmc_id++, ARRAY_SIZE(layers), layers, + 0); if (mci == NULL) return -ENOMEM; diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index d427c69..f3f9fed 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -727,7 +727,10 @@ ppc4xx_edac_handle_ce(struct mem_ctl_info *mci, for (row = 0; row < mci->nr_csrows; row++) if (ppc4xx_edac_check_bank_error(status, row)) - edac_mc_handle_ce_no_info(mci, message); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 0, 0, 0, + row, 0, -1, + message, "", NULL); } /** @@ -755,7 +758,10 @@ ppc4xx_edac_handle_ue(struct mem_ctl_info *mci, for (row = 0; row < mci->nr_csrows; row++) if (ppc4xx_edac_check_bank_error(status, row)) - edac_mc_handle_ue(mci, page, offset, row, message); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, offset, 0, + row, 0, -1, + message, "", NULL); } /** @@ -895,9 +901,8 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1) enum mem_type mtype; enum dev_type dtype; enum edac_type edac_mode; - int row; - u32 mbxcf, size; - static u32 ppc4xx_last_page; + int row, j; + u32 mbxcf, size, nr_pages; /* Establish the memory type and width */ @@ -948,7 +953,7 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1) case SDRAM_MBCF_SZ_2GB: case SDRAM_MBCF_SZ_4GB: case SDRAM_MBCF_SZ_8GB: - csi->nr_pages = SDRAM_MBCF_SZ_TO_PAGES(size); + nr_pages = SDRAM_MBCF_SZ_TO_PAGES(size); break; default: ppc4xx_edac_mc_printk(KERN_ERR, mci, @@ -959,10 +964,6 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1) goto done; } - csi->first_page = ppc4xx_last_page; - csi->last_page = csi->first_page + csi->nr_pages - 1; - csi->page_mask = 0; - /* * It's unclear exactly what grain should be set to * here. The SDRAM_ECCES register allows resolution of @@ -975,15 +976,17 @@ ppc4xx_edac_init_csrows(struct mem_ctl_info *mci, u32 mcopt1) * possible values would be the PLB width (16), the * page size (PAGE_SIZE) or the memory width (2 or 4). */ + for (j = 0; j < csi->nr_channels; j++) { + struct dimm_info *dimm = csi->channels[j].dimm; - csi->grain = 1; - - csi->mtype = mtype; - csi->dtype = dtype; + dimm->nr_pages = nr_pages / csi->nr_channels; + dimm->grain = 1; - csi->edac_mode = edac_mode; + dimm->mtype = mtype; + dimm->dtype = dtype; - ppc4xx_last_page += csi->nr_pages; + dimm->edac_mode = edac_mode; + } } done: @@ -1236,6 +1239,7 @@ static int __devinit ppc4xx_edac_probe(struct platform_device *op) dcr_host_t dcr_host; const struct device_node *np = op->dev.of_node; struct mem_ctl_info *mci = NULL; + struct edac_mc_layer layers[2]; static int ppc4xx_edac_instance; /* @@ -1281,12 +1285,14 @@ static int __devinit ppc4xx_edac_probe(struct platform_device *op) * controller instance and perform the appropriate * initialization. */ - - mci = edac_mc_alloc(sizeof(struct ppc4xx_edac_pdata), - ppc4xx_edac_nr_csrows, - ppc4xx_edac_nr_chans, - ppc4xx_edac_instance); - + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = ppc4xx_edac_nr_csrows; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = ppc4xx_edac_nr_chans; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(ppc4xx_edac_instance, ARRAY_SIZE(layers), layers, + sizeof(struct ppc4xx_edac_pdata)); if (mci == NULL) { ppc4xx_edac_printk(KERN_ERR, "%s: " "Failed to allocate EDAC MC instance!\n", diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c index 6d908ad..e1cacd1 100644 --- a/drivers/edac/r82600_edac.c +++ b/drivers/edac/r82600_edac.c @@ -179,10 +179,11 @@ static int r82600_process_error_info(struct mem_ctl_info *mci, error_found = 1; if (handle_errors) - edac_mc_handle_ce(mci, page, 0, /* not avail */ - syndrome, - edac_mc_find_csrow_by_page(mci, page), - 0, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + page, 0, syndrome, + edac_mc_find_csrow_by_page(mci, page), + 0, -1, + mci->ctl_name, "", NULL); } if (info->eapr & BIT(1)) { /* UE? */ @@ -190,9 +191,11 @@ static int r82600_process_error_info(struct mem_ctl_info *mci, if (handle_errors) /* 82600 doesn't give enough info */ - edac_mc_handle_ue(mci, page, 0, - edac_mc_find_csrow_by_page(mci, page), - mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + page, 0, 0, + edac_mc_find_csrow_by_page(mci, page), + 0, -1, + mci->ctl_name, "", NULL); } return error_found; @@ -216,6 +219,7 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, u8 dramcr) { struct csrow_info *csrow; + struct dimm_info *dimm; int index; u8 drbar; /* SDRAM Row Boundary Address Register */ u32 row_high_limit, row_high_limit_last; @@ -227,6 +231,7 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, for (index = 0; index < mci->nr_csrows; index++) { csrow = &mci->csrows[index]; + dimm = csrow->channels[0].dimm; /* find the DRAM Chip Select Base address and mask */ pci_read_config_byte(pdev, R82600_DRBA + index, &drbar); @@ -247,16 +252,17 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, csrow->first_page = row_base >> PAGE_SHIFT; csrow->last_page = (row_high_limit >> PAGE_SHIFT) - 1; - csrow->nr_pages = csrow->last_page - csrow->first_page + 1; + + dimm->nr_pages = csrow->last_page - csrow->first_page + 1; /* Error address is top 19 bits - so granularity is * * 14 bits */ - csrow->grain = 1 << 14; - csrow->mtype = reg_sdram ? MEM_RDDR : MEM_DDR; + dimm->grain = 1 << 14; + dimm->mtype = reg_sdram ? MEM_RDDR : MEM_DDR; /* FIXME - check that this is unknowable with this chipset */ - csrow->dtype = DEV_UNKNOWN; + dimm->dtype = DEV_UNKNOWN; /* Mode is global on 82600 */ - csrow->edac_mode = ecc_on ? EDAC_SECDED : EDAC_NONE; + dimm->edac_mode = ecc_on ? EDAC_SECDED : EDAC_NONE; row_high_limit_last = row_high_limit; } } @@ -264,6 +270,7 @@ static void r82600_init_csrows(struct mem_ctl_info *mci, struct pci_dev *pdev, static int r82600_probe1(struct pci_dev *pdev, int dev_idx) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; u8 dramcr; u32 eapr; u32 scrub_disabled; @@ -278,8 +285,13 @@ static int r82600_probe1(struct pci_dev *pdev, int dev_idx) debugf2("%s(): sdram refresh rate = %#0x\n", __func__, sdram_refresh_rate); debugf2("%s(): DRAMC register = %#0x\n", __func__, dramcr); - mci = edac_mc_alloc(0, R82600_NR_CSROWS, R82600_NR_CHANS, 0); - + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = R82600_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = R82600_NR_CHANS; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0); if (mci == NULL) return -ENOMEM; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 123204f..4adaf4b 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -314,8 +314,6 @@ struct sbridge_pvt { struct sbridge_info info; struct sbridge_channel channel[NUM_CHANNELS]; - int csrow_map[NUM_CHANNELS][MAX_DIMMS]; - /* Memory type detection */ bool is_mirrored, is_lockstep, is_close_pg; @@ -487,29 +485,14 @@ static struct pci_dev *get_pdev_slot_func(u8 bus, unsigned slot, } /** - * sbridge_get_active_channels() - gets the number of channels and csrows + * check_if_ecc_is_active() - Checks if ECC is active * bus: Device bus - * @channels: Number of channels that will be returned - * @csrows: Number of csrows found - * - * Since EDAC core needs to know in advance the number of available channels - * and csrows, in order to allocate memory for csrows/channels, it is needed - * to run two similar steps. At the first step, implemented on this function, - * it checks the number of csrows/channels present at one socket, identified - * by the associated PCI bus. - * this is used in order to properly allocate the size of mci components. - * Note: one csrow is one dimm. */ -static int sbridge_get_active_channels(const u8 bus, unsigned *channels, - unsigned *csrows) +static int check_if_ecc_is_active(const u8 bus) { struct pci_dev *pdev = NULL; - int i, j; u32 mcmtr; - *channels = 0; - *csrows = 0; - pdev = get_pdev_slot_func(bus, 15, 0); if (!pdev) { sbridge_printk(KERN_ERR, "Couldn't find PCI device " @@ -523,41 +506,14 @@ static int sbridge_get_active_channels(const u8 bus, unsigned *channels, sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n"); return -ENODEV; } - - for (i = 0; i < NUM_CHANNELS; i++) { - u32 mtr; - - /* Device 15 functions 2 - 5 */ - pdev = get_pdev_slot_func(bus, 15, 2 + i); - if (!pdev) { - sbridge_printk(KERN_ERR, "Couldn't find PCI device " - "%2x.%02d.%d!!!\n", - bus, 15, 2 + i); - return -ENODEV; - } - (*channels)++; - - for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) { - pci_read_config_dword(pdev, mtr_regs[j], &mtr); - debugf1("Bus#%02x channel #%d MTR%d = %x\n", bus, i, j, mtr); - if (IS_DIMM_PRESENT(mtr)) - (*csrows)++; - } - } - - debugf0("Number of active channels: %d, number of active dimms: %d\n", - *channels, *csrows); - return 0; } -static int get_dimm_config(const struct mem_ctl_info *mci) +static int get_dimm_config(struct mem_ctl_info *mci) { struct sbridge_pvt *pvt = mci->pvt_info; - struct csrow_info *csr; + struct dimm_info *dimm; int i, j, banks, ranks, rows, cols, size, npages; - int csrow = 0; - unsigned long last_page = 0; u32 reg; enum edac_type mode; enum mem_type mtype; @@ -616,6 +572,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci) u32 mtr; for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) { + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, + i, j, 0); pci_read_config_dword(pvt->pci_tad[i], mtr_regs[j], &mtr); debugf4("Channel #%d MTR%d = %x\n", i, j, mtr); @@ -634,29 +592,15 @@ static int get_dimm_config(const struct mem_ctl_info *mci) pvt->sbridge_dev->mc, i, j, size, npages, banks, ranks, rows, cols); - csr = &mci->csrows[csrow]; - - csr->first_page = last_page; - csr->last_page = last_page + npages - 1; - csr->page_mask = 0UL; /* Unused */ - csr->nr_pages = npages; - csr->grain = 32; - csr->csrow_idx = csrow; - csr->dtype = (banks == 8) ? DEV_X8 : DEV_X4; - csr->ce_count = 0; - csr->ue_count = 0; - csr->mtype = mtype; - csr->edac_mode = mode; - csr->nr_channels = 1; - csr->channels[0].chan_idx = i; - csr->channels[0].ce_count = 0; - pvt->csrow_map[i][j] = csrow; - snprintf(csr->channels[0].label, - sizeof(csr->channels[0].label), + + dimm->nr_pages = npages; + dimm->grain = 32; + dimm->dtype = (banks == 8) ? DEV_X8 : DEV_X4; + dimm->mtype = mtype; + dimm->edac_mode = mode; + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_Channel#%u_DIMM#%u", pvt->sbridge_dev->source_id, i, j); - last_page += npages; - csrow++; } } } @@ -844,11 +788,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci, u8 *socket, long *channel_mask, u8 *rank, - char *area_type) + char **area_type, char *msg) { struct mem_ctl_info *new_mci; struct sbridge_pvt *pvt = mci->pvt_info; - char msg[256]; int n_rir, n_sads, n_tads, sad_way, sck_xch; int sad_interl, idx, base_ch; int interleave_mode; @@ -870,12 +813,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci, */ if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) { sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } if (addr >= (u64)pvt->tohm) { sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } @@ -892,7 +833,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, limit = SAD_LIMIT(reg); if (limit <= prv) { sprintf(msg, "Can't discover the memory socket"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } if (addr <= limit) @@ -901,10 +841,9 @@ static int get_memory_error_data(struct mem_ctl_info *mci, } if (n_sads == MAX_SAD) { sprintf(msg, "Can't discover the memory socket"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } - area_type = get_dram_attr(reg); + *area_type = get_dram_attr(reg); interleave_mode = INTERLEAVE_MODE(reg); pci_read_config_dword(pvt->pci_sad0, interleave_list[n_sads], @@ -942,7 +881,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, break; default: sprintf(msg, "Can't discover socket interleave"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } *socket = sad_interleave[idx]; @@ -957,7 +895,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (!new_mci) { sprintf(msg, "Struct for socket #%u wasn't initialized", *socket); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } mci = new_mci; @@ -973,7 +910,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, limit = TAD_LIMIT(reg); if (limit <= prv) { sprintf(msg, "Can't discover the memory channel"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } if (addr <= limit) @@ -1013,7 +949,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, break; default: sprintf(msg, "Can't discover the TAD target"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } *channel_mask = 1 << base_ch; @@ -1027,7 +962,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, break; default: sprintf(msg, "Invalid mirror set. Can't decode addr"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } } else @@ -1055,7 +989,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (offset > addr) { sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!", offset, addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } addr -= offset; @@ -1095,7 +1028,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (n_rir == MAX_RIR_RANGES) { sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx", ch_addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } rir_way = RIR_WAY(reg); @@ -1409,7 +1341,8 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, { struct mem_ctl_info *new_mci; struct sbridge_pvt *pvt = mci->pvt_info; - char *type, *optype, *msg, *recoverable_msg; + enum hw_event_mc_err_type tp_event; + char *type, *optype, msg[256]; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -1421,13 +1354,21 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, u32 optypenum = GET_BITFIELD(m->status, 4, 6); long channel_mask, first_channel; u8 rank, socket; - int csrow, rc, dimm; - char *area_type = "Unknown"; - - if (ripv) - type = "NON_FATAL"; - else - type = "FATAL"; + int rc, dimm; + char *area_type = NULL; + + if (uncorrected_error) { + if (ripv) { + type = "FATAL"; + tp_event = HW_EVENT_ERR_FATAL; + } else { + type = "NON_FATAL"; + tp_event = HW_EVENT_ERR_UNCORRECTED; + } + } else { + type = "CORRECTED"; + tp_event = HW_EVENT_ERR_CORRECTED; + } /* * According with Table 15-9 of the Intel Architecture spec vol 3A, @@ -1445,19 +1386,19 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, } else { switch (optypenum) { case 0: - optype = "generic undef request"; + optype = "generic undef request error"; break; case 1: - optype = "memory read"; + optype = "memory read error"; break; case 2: - optype = "memory write"; + optype = "memory write error"; break; case 3: - optype = "addr/cmd"; + optype = "addr/cmd error"; break; case 4: - optype = "memory scrubbing"; + optype = "memory scrubbing error"; break; default: optype = "reserved"; @@ -1466,13 +1407,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, } rc = get_memory_error_data(mci, m->addr, &socket, - &channel_mask, &rank, area_type); + &channel_mask, &rank, &area_type, msg); if (rc < 0) - return; + goto err_parsing; new_mci = get_mci_for_node_id(socket); if (!new_mci) { - edac_mc_handle_ce_no_info(mci, "Error: socket got corrupted!"); - return; + strcpy(msg, "Error: socket got corrupted!"); + goto err_parsing; } mci = new_mci; pvt = mci->pvt_info; @@ -1486,45 +1427,39 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, else dimm = 2; - csrow = pvt->csrow_map[first_channel][dimm]; - - if (uncorrected_error && recoverable) - recoverable_msg = " recoverable"; - else - recoverable_msg = ""; /* - * FIXME: What should we do with "channel" information on mcelog? - * Probably, we can just discard it, as the channel information - * comes from the get_memory_error_data() address decoding + * FIXME: On some memory configurations (mirror, lockstep), the + * Memory Controller can't point the error to a single DIMM. The + * EDAC core should be handling the channel mask, in order to point + * to the group of dimm's where the error may be happening. */ - msg = kasprintf(GFP_ATOMIC, - "%d %s error(s): %s on %s area %s%s: cpu=%d Err=%04x:%04x (ch=%d), " - "addr = 0x%08llx => socket=%d, Channel=%ld(mask=%ld), rank=%d\n", - core_err_cnt, - area_type, - optype, - type, - recoverable_msg, - overflow ? "OVERFLOW" : "", - m->cpu, - mscod, errcode, - channel, /* 1111b means not specified */ - (long long) m->addr, - socket, - first_channel, /* This is the real channel on SB */ - channel_mask, - rank); + snprintf(msg, sizeof(msg), + "count:%d%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d", + core_err_cnt, + overflow ? " OVERFLOW" : "", + (uncorrected_error && recoverable) ? " recoverable" : "", + area_type, + mscod, errcode, + socket, + channel_mask, + rank); debugf0("%s", msg); + /* FIXME: need support for channel mask */ + /* Call the helper to output message */ - if (uncorrected_error) - edac_mc_handle_fbd_ue(mci, csrow, 0, 0, msg); - else - edac_mc_handle_fbd_ce(mci, csrow, 0, msg); + edac_mc_handle_error(tp_event, mci, + m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, + channel, dimm, -1, + optype, msg, m); + return; +err_parsing: + edac_mc_handle_error(tp_event, mci, 0, 0, 0, + -1, -1, -1, + msg, "", m); - kfree(msg); } /* @@ -1683,16 +1618,25 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct sbridge_pvt *pvt; - int rc, channels, csrows; + int rc; /* Check the number of active and not disabled channels */ - rc = sbridge_get_active_channels(sbridge_dev->bus, &channels, &csrows); + rc = check_if_ecc_is_active(sbridge_dev->bus); if (unlikely(rc < 0)) return rc; /* allocate a new MC control structure */ - mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, sbridge_dev->mc); + layers[0].type = EDAC_MC_LAYER_CHANNEL; + layers[0].size = NUM_CHANNELS; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_SLOT; + layers[1].size = MAX_DIMMS; + layers[1].is_virt_csrow = true; + mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers, + sizeof(*pvt)); + if (unlikely(!mci)) return -ENOMEM; diff --git a/drivers/edac/tile_edac.c b/drivers/edac/tile_edac.c index e99d009..7bb4614 100644 --- a/drivers/edac/tile_edac.c +++ b/drivers/edac/tile_edac.c @@ -71,7 +71,10 @@ static void tile_edac_check(struct mem_ctl_info *mci) if (mem_error.sbe_count != priv->ce_count) { dev_dbg(mci->dev, "ECC CE err on node %d\n", priv->node); priv->ce_count = mem_error.sbe_count; - edac_mc_handle_ce(mci, 0, 0, 0, 0, 0, mci->ctl_name); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 0, 0, 0, + 0, 0, -1, + mci->ctl_name, "", NULL); } } @@ -84,6 +87,7 @@ static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci) struct csrow_info *csrow = &mci->csrows[0]; struct tile_edac_priv *priv = mci->pvt_info; struct mshim_mem_info mem_info; + struct dimm_info *dimm = csrow->channels[0].dimm; if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_info, sizeof(struct mshim_mem_info), MSHIM_MEM_INFO_OFF) != @@ -93,27 +97,25 @@ static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci) } if (mem_info.mem_ecc) - csrow->edac_mode = EDAC_SECDED; + dimm->edac_mode = EDAC_SECDED; else - csrow->edac_mode = EDAC_NONE; + dimm->edac_mode = EDAC_NONE; switch (mem_info.mem_type) { case DDR2: - csrow->mtype = MEM_DDR2; + dimm->mtype = MEM_DDR2; break; case DDR3: - csrow->mtype = MEM_DDR3; + dimm->mtype = MEM_DDR3; break; default: return -1; } - csrow->first_page = 0; - csrow->nr_pages = mem_info.mem_size >> PAGE_SHIFT; - csrow->last_page = csrow->first_page + csrow->nr_pages - 1; - csrow->grain = TILE_EDAC_ERROR_GRAIN; - csrow->dtype = DEV_UNKNOWN; + dimm->nr_pages = mem_info.mem_size >> PAGE_SHIFT; + dimm->grain = TILE_EDAC_ERROR_GRAIN; + dimm->dtype = DEV_UNKNOWN; return 0; } @@ -123,6 +125,7 @@ static int __devinit tile_edac_mc_probe(struct platform_device *pdev) char hv_file[32]; int hv_devhdl; struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct tile_edac_priv *priv; int rc; @@ -132,8 +135,14 @@ static int __devinit tile_edac_mc_probe(struct platform_device *pdev) return -EINVAL; /* A TILE MC has a single channel and one chip-select row. */ - mci = edac_mc_alloc(sizeof(struct tile_edac_priv), - TILE_EDAC_NR_CSROWS, TILE_EDAC_NR_CHANS, pdev->id); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = TILE_EDAC_NR_CSROWS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = TILE_EDAC_NR_CHANS; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(pdev->id, ARRAY_SIZE(layers), layers, + sizeof(struct tile_edac_priv)); if (mci == NULL) return -ENOMEM; priv = mci->pvt_info; diff --git a/drivers/edac/x38_edac.c b/drivers/edac/x38_edac.c index a438297..1ac7962 100644 --- a/drivers/edac/x38_edac.c +++ b/drivers/edac/x38_edac.c @@ -215,19 +215,26 @@ static void x38_process_error_info(struct mem_ctl_info *mci, return; if ((info->errsts ^ info->errsts2) & X38_ERRSTS_BITS) { - edac_mc_handle_ce_no_info(mci, "UE overwrote CE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 0, 0, 0, + -1, -1, -1, + "UE overwrote CE", "", NULL); info->errsts = info->errsts2; } for (channel = 0; channel < x38_channel_num; channel++) { log = info->eccerrlog[channel]; if (log & X38_ECCERRLOG_UE) { - edac_mc_handle_ue(mci, 0, 0, - eccerrlog_row(channel, log), "x38 UE"); + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 0, 0, 0, + eccerrlog_row(channel, log), + -1, -1, + "x38 UE", "", NULL); } else if (log & X38_ECCERRLOG_CE) { - edac_mc_handle_ce(mci, 0, 0, - eccerrlog_syndrome(log), - eccerrlog_row(channel, log), 0, "x38 CE"); + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 0, 0, eccerrlog_syndrome(log), + eccerrlog_row(channel, log), + -1, -1, + "x38 CE", "", NULL); } } } @@ -317,9 +324,9 @@ static unsigned long drb_to_nr_pages( static int x38_probe1(struct pci_dev *pdev, int dev_idx) { int rc; - int i; + int i, j; struct mem_ctl_info *mci = NULL; - unsigned long last_page; + struct edac_mc_layer layers[2]; u16 drbs[X38_CHANNELS][X38_RANKS_PER_CHANNEL]; bool stacked; void __iomem *window; @@ -335,7 +342,13 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx) how_many_channel(pdev); /* FIXME: unconventional pvt_info usage */ - mci = edac_mc_alloc(0, X38_RANKS, x38_channel_num, 0); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = X38_RANKS; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = x38_channel_num; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, 0); if (!mci) return -ENOMEM; @@ -363,7 +376,6 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx) * cumulative; the last one will contain the total memory * contained in all ranks. */ - last_page = -1UL; for (i = 0; i < mci->nr_csrows; i++) { unsigned long nr_pages; struct csrow_info *csrow = &mci->csrows[i]; @@ -372,20 +384,18 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx) i / X38_RANKS_PER_CHANNEL, i % X38_RANKS_PER_CHANNEL); - if (nr_pages == 0) { - csrow->mtype = MEM_EMPTY; + if (nr_pages == 0) continue; - } - csrow->first_page = last_page + 1; - last_page += nr_pages; - csrow->last_page = last_page; - csrow->nr_pages = nr_pages; + for (j = 0; j < x38_channel_num; j++) { + struct dimm_info *dimm = csrow->channels[j].dimm; - csrow->grain = nr_pages << PAGE_SHIFT; - csrow->mtype = MEM_DDR2; - csrow->dtype = DEV_UNKNOWN; - csrow->edac_mode = EDAC_UNKNOWN; + dimm->nr_pages = nr_pages / x38_channel_num; + dimm->grain = nr_pages << PAGE_SHIFT; + dimm->mtype = MEM_DDR2; + dimm->dtype = DEV_UNKNOWN; + dimm->edac_mode = EDAC_UNKNOWN; + } } x38_clear_error_info(mci); diff --git a/include/linux/edac.h b/include/linux/edac.h index c621d76..91ba3ba 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -71,6 +71,25 @@ enum dev_type { #define DEV_FLAG_X64 BIT(DEV_X64) /** + * enum hw_event_mc_err_type - type of the detected error + * + * @HW_EVENT_ERR_CORRECTED: Corrected Error - Indicates that an ECC + * corrected error was detected + * @HW_EVENT_ERR_UNCORRECTED: Uncorrected Error - Indicates an error that + * can't be corrected by ECC, but it is not + * fatal (maybe it is on an unused memory area, + * or the memory controller could recover from + * it for example, by re-trying the operation). + * @HW_EVENT_ERR_FATAL: Fatal Error - Uncorrected error that could not + * be recovered. + */ +enum hw_event_mc_err_type { + HW_EVENT_ERR_CORRECTED, + HW_EVENT_ERR_UNCORRECTED, + HW_EVENT_ERR_FATAL, +}; + +/** * enum mem_type - memory types. For a more detailed reference, please see * http://en.wikipedia.org/wiki/DRAM * @@ -313,38 +332,141 @@ enum scrub_type { */ /** + * enum edac_mc_layer - memory controller hierarchy layer + * + * @EDAC_MC_LAYER_BRANCH: memory layer is named "branch" + * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" + * @EDAC_MC_LAYER_SLOT: memory layer is named "slot" + * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select" + * + * This enum is used by the drivers to tell edac_mc_sysfs what name should + * be used when describing a memory stick location. + */ +enum edac_mc_layer_type { + EDAC_MC_LAYER_BRANCH, + EDAC_MC_LAYER_CHANNEL, + EDAC_MC_LAYER_SLOT, + EDAC_MC_LAYER_CHIP_SELECT, +}; + +/** + * struct edac_mc_layer - describes the memory controller hierarchy + * @layer: layer type + * @size: number of components per layer. For example, + * if the channel layer has two channels, size = 2 + * @is_virt_csrow: This layer is part of the "csrow" when old API + * compatibility mode is enabled. Otherwise, it is + * a channel + */ +struct edac_mc_layer { + enum edac_mc_layer_type type; + unsigned size; + bool is_virt_csrow; +}; + +/* + * Maximum number of layers used by the memory controller to uniquely + * identify a single memory stick. + * NOTE: Changing this constant requires not only to change the constant + * below, but also to change the existing code at the core, as there are + * some code there that are optimized for 3 layers. + */ +#define EDAC_MAX_LAYERS 3 + +/** + * EDAC_DIMM_PTR - Macro responsible to find a pointer inside a pointer array + * for the element given by [layer0,layer1,layer2] position + * + * @layers: a struct edac_mc_layer array, describing how many elements + * were allocated for each layer + * @var: name of the var where we want to get the pointer + * (like mci->dimms) + * @n_layers: Number of layers at the @layers array + * @layer0: layer0 position + * @layer1: layer1 position. Unused if n_layers < 2 + * @layer2: layer2 position. Unused if n_layers < 3 + * + * For 1 layer, this macro returns &var[layer0] + * For 2 layers, this macro is similar to allocate a bi-dimensional array + * and to return "&var[layer0][layer1]" + * For 3 layers, this macro is similar to allocate a tri-dimensional array + * and to return "&var[layer0][layer1][layer2]" + * + * A loop could be used here to make it more generic, but, as we only have + * 3 layers, this is a little faster. + * By design, layers can never be 0 or more than 3. If that ever happens, + * a NULL is returned, causing an OOPS during the memory allocation routine, + * with would point to the developer that he's doing something wrong. + */ +#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \ + typeof(var) __p; \ + if ((nlayers) == 1) \ + __p = &var[layer0]; \ + else if ((nlayers) == 2) \ + __p = &var[(layer1) + ((layers[1]).size * (layer0))]; \ + else if ((nlayers) == 3) \ + __p = &var[(layer2) + ((layers[2]).size * ((layer1) + \ + ((layers[1]).size * (layer0))))]; \ + else \ + __p = NULL; \ + __p; \ +}) + + +/* FIXME: add the proper per-location error counts */ +struct dimm_info { + char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */ + + /* Memory location data */ + unsigned location[EDAC_MAX_LAYERS]; + + struct mem_ctl_info *mci; /* the parent */ + + u32 grain; /* granularity of reported error in bytes */ + enum dev_type dtype; /* memory device type */ + enum mem_type mtype; /* memory dimm type */ + enum edac_type edac_mode; /* EDAC mode for this dimm */ + + u32 nr_pages; /* number of pages on this dimm */ + + unsigned csrow, cschannel; /* Points to the old API data */ +}; + +/** * struct rank_info - contains the information for one DIMM rank * * @chan_idx: channel number where the rank is (typically, 0 or 1) * @ce_count: number of correctable errors for this rank - * @label: DIMM label. Different ranks for the same DIMM should be - * filled, on userspace, with the same label. - * FIXME: The core currently won't enforce it. * @csrow: A pointer to the chip select row structure (the parent * structure). The location of the rank is given by * the (csrow->csrow_idx, chan_idx) vector. + * @dimm: A pointer to the DIMM structure, where the DIMM label + * information is stored. + * + * FIXME: Currently, the EDAC core model will assume one DIMM per rank. + * This is a bad assumption, but it makes this patch easier. Later + * patches in this series will fix this issue. */ struct rank_info { int chan_idx; - u32 ce_count; - char label[EDAC_MC_LABEL_LEN + 1]; - struct csrow_info *csrow; /* the parent */ + struct csrow_info *csrow; + struct dimm_info *dimm; + + u32 ce_count; /* Correctable Errors for this csrow */ }; struct csrow_info { - unsigned long first_page; /* first page number in dimm */ - unsigned long last_page; /* last page number in dimm */ + /* Used only by edac_mc_find_csrow_by_page() */ + unsigned long first_page; /* first page number in csrow */ + unsigned long last_page; /* last page number in csrow */ unsigned long page_mask; /* used for interleaving - - * 0UL for non intlv - */ - u32 nr_pages; /* number of pages in csrow */ - u32 grain; /* granularity of reported error in bytes */ - int csrow_idx; /* the chip-select row */ - enum dev_type dtype; /* memory device type */ + * 0UL for non intlv */ + + int csrow_idx; /* the chip-select row */ + u32 ue_count; /* Uncorrectable Errors for this csrow */ u32 ce_count; /* Correctable Errors for this csrow */ - enum mem_type mtype; /* memory csrow type */ - enum edac_type edac_mode; /* EDAC mode for this csrow */ + struct mem_ctl_info *mci; /* the parent */ struct kobject kobj; /* sysfs kobject for this csrow */ @@ -426,8 +548,20 @@ struct mem_ctl_info { unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci, unsigned long page); int mc_idx; - int nr_csrows; struct csrow_info *csrows; + unsigned nr_csrows, num_cschannel; + + /* Memory Controller hierarchy */ + unsigned n_layers; + struct edac_mc_layer *layers; + bool mem_is_per_rank; + + /* + * DIMM info. Will eventually remove the entire csrows_info some day + */ + unsigned tot_dimms; + struct dimm_info *dimms; + /* * FIXME - what about controllers on other busses? - IDs must be * unique. dev pointer should be sufficiently unique, but @@ -440,12 +574,16 @@ struct mem_ctl_info { const char *dev_name; char proc_name[MC_PROC_NAME_MAX_LEN + 1]; void *pvt_info; - u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */ - u32 ce_noinfo_count; /* Correctable Errors w/o info */ - u32 ue_count; /* Total Uncorrectable Errors for this MC */ - u32 ce_count; /* Total Correctable Errors for this MC */ unsigned long start_time; /* mci load start time (in jiffies) */ + /* + * drivers shouldn't access those fields directly, as the core + * already handles that. + */ + u32 ce_noinfo_count, ue_noinfo_count; + u32 ue_mc, ce_mc; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; + struct completion complete; /* edac sysfs device control */ @@ -458,7 +596,7 @@ struct mem_ctl_info { * by the low level driver. * * Set by the low level driver to provide attributes at the - * controller level, same level as 'ue_count' and 'ce_count' above. + * controller level. * An array of structures, NULL terminated * * If attributes are desired, then set to array of attributes |