From 20f4d69243785167449e6e765ce9f2a7b097ac45 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 29 Sep 2016 13:43:21 -0700 Subject: [PATCH 01/26] EDAC, {sb,skx}_edac: Use Intel model macros instead of open-coding them We now have symbolic names for a bunch of Intel CPU models via asm/intel-family.h. The original conversion missed the EDAC drivers. Convert them. Signed-off-by: Dave Hansen Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20160929204321.9FAE5F84@viggo.jf.intel.com [ Remove comment, macro name is descriptive enough. ] Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 13 +++++++------ drivers/edac/skx_edac.c | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 54775221a01f..843ac6c351fd 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -3365,12 +3366,12 @@ fail0: { X86_VENDOR_INTEL, 6, model, 0, (unsigned long)&table } static const struct x86_cpu_id sbridge_cpuids[] = { - ICPU(0x2d, pci_dev_descr_sbridge_table), /* SANDY_BRIDGE */ - ICPU(0x3e, pci_dev_descr_ibridge_table), /* IVY_BRIDGE */ - ICPU(0x3f, pci_dev_descr_haswell_table), /* HASWELL */ - ICPU(0x4f, pci_dev_descr_broadwell_table), /* BROADWELL */ - ICPU(0x56, pci_dev_descr_broadwell_table), /* BROADWELL-DE */ - ICPU(0x57, pci_dev_descr_knl_table), /* KNIGHTS_LANDING */ + ICPU(INTEL_FAM6_SANDYBRIDGE_X, pci_dev_descr_sbridge_table), + ICPU(INTEL_FAM6_IVYBRIDGE_X, pci_dev_descr_ibridge_table), + ICPU(INTEL_FAM6_HASWELL_X, pci_dev_descr_haswell_table), + ICPU(INTEL_FAM6_BROADWELL_X, pci_dev_descr_broadwell_table), + ICPU(INTEL_FAM6_BROADWELL_XEON_D, pci_dev_descr_broadwell_table), + ICPU(INTEL_FAM6_XEON_PHI_KNL, pci_dev_descr_knl_table), { } }; MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids); diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 0ff4878c2aa1..f3b3d3be1e84 100644 --- 
a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -263,7 +264,7 @@ fail: } const struct x86_cpu_id skx_cpuids[] = { - { X86_VENDOR_INTEL, 6, 0x55, 0, 0 }, /* Skylake */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X, 0, 0 }, { } }; MODULE_DEVICE_TABLE(x86cpu, skx_cpuids); From 9a9260ca926c6e7b6bcfd3c93a1820d86565ff4f Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Thu, 13 Oct 2016 17:30:59 +0200 Subject: [PATCH 02/26] EDAC, sb_edac: Add Knights Mill support Add Knights Mill (KNM) to the list of CPU models supported by sb_edac. Signed-off-by: Piotr Luc Reviewed-by: Dave Hansen Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/20161013153105.2517-6-piotr.luc@intel.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 843ac6c351fd..c1ad0eb7d5dd 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3372,6 +3372,7 @@ static const struct x86_cpu_id sbridge_cpuids[] = { ICPU(INTEL_FAM6_BROADWELL_X, pci_dev_descr_broadwell_table), ICPU(INTEL_FAM6_BROADWELL_XEON_D, pci_dev_descr_broadwell_table), ICPU(INTEL_FAM6_XEON_PHI_KNL, pci_dev_descr_knl_table), + ICPU(INTEL_FAM6_XEON_PHI_KNM, pci_dev_descr_knl_table), { } }; MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids); From 240ea9214a6042f3807fd09f149d74ae5700e117 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 22 Oct 2016 14:38:18 +0000 Subject: [PATCH 03/26] EDAC, skx_edac: Fix non static symbol warnings Fix the following sparse warnings: drivers/edac/skx_edac.c:266:25: warning: symbol 'skx_cpuids' was not declared. Should it be static? drivers/edac/skx_edac.c:1040:12: warning: symbol 'skx_init' was not declared. Should it be static? 
Signed-off-by: Wei Yongjun Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1477147098-2842-1-git-send-email-weiyj.lk@gmail.com Signed-off-by: Borislav Petkov --- drivers/edac/skx_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index f3b3d3be1e84..9edcb29b3001 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -263,7 +263,7 @@ fail: return -ENODEV; } -const struct x86_cpu_id skx_cpuids[] = { +static const struct x86_cpu_id skx_cpuids[] = { { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X, 0, 0 }, { } }; @@ -1037,7 +1037,7 @@ static void skx_remove(void) * search for all the devices we need * check which DIMMs are present. */ -int __init skx_init(void) +static int __init skx_init(void) { const struct x86_cpu_id *id; const struct munit *m; From 90e493d7d51c791a2adc2de962cbd5000f1b7460 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Wed, 19 Oct 2016 14:53:47 -0500 Subject: [PATCH 04/26] EDAC, altera: Disable IRQs while injecting SDRAM errors Disable IRQs while injecting SDRAM errors. The RT patches exposed a spinlock deadlock where the spinlock taken for the regmap write deadlocked with the IRQ clear regmap write. Error injection is not normally enabled for ECC but only for testing. 
Signed-off-by: Thor Thayer Cc: linux-edac Link: http://lkml.kernel.org/r/1476906827-9412-1-git-send-email-tthayer@opensource.altera.com Signed-off-by: Borislav Petkov --- drivers/edac/altera_edac.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 58d3e2b39b5b..6421cc3c7dc1 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -153,13 +153,17 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file, if (count == 3) { edac_printk(KERN_ALERT, EDAC_MC, "Inject Double bit error\n"); + local_irq_disable(); regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset, (read_reg | priv->ue_set_mask)); + local_irq_enable(); } else { edac_printk(KERN_ALERT, EDAC_MC, "Inject Single bit error\n"); + local_irq_disable(); regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset, (read_reg | priv->ce_set_mask)); + local_irq_enable(); } ptemp[0] = 0x5A5A5A5A; From c73e8833bec5a8ed4f55db0ca964cc1e998656df Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 14 Nov 2016 13:26:11 +0100 Subject: [PATCH 05/26] EDAC, mc: Fix locking around mc_devices list When accessing the mc_devices list of memory controller descriptors, we need to hold mem_ctls_mutex. This was not always the case; fix that. Make all external callers call a version which grabs the mutex, since the mutex itself is local to edac_mc.c.
Reported-by: Yazen Ghannam Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 49 ++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index c3ee3ad98a63..d2ea9c4f1824 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -482,15 +482,8 @@ void edac_mc_free(struct mem_ctl_info *mci) } EXPORT_SYMBOL_GPL(edac_mc_free); - -/** - * find_mci_by_dev - * - * scan list of controllers looking for the one that manages - * the 'dev' device - * @dev: pointer to a struct device related with the MCI - */ -struct mem_ctl_info *find_mci_by_dev(struct device *dev) +/* Caller must hold mem_ctls_mutex */ +static struct mem_ctl_info *__find_mci_by_dev(struct device *dev) { struct mem_ctl_info *mci; struct list_head *item; @@ -506,6 +499,24 @@ struct mem_ctl_info *find_mci_by_dev(struct device *dev) return NULL; } + +/** + * find_mci_by_dev + * + * scan list of controllers looking for the one that manages + * the 'dev' device + * @dev: pointer to a struct device related with the MCI + */ +struct mem_ctl_info *find_mci_by_dev(struct device *dev) +{ + struct mem_ctl_info *ret; + + mutex_lock(&mem_ctls_mutex); + ret = __find_mci_by_dev(dev); + mutex_unlock(&mem_ctls_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(find_mci_by_dev); /* @@ -588,7 +599,7 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci) insert_before = &mc_devices; - p = find_mci_by_dev(mci->pdev); + p = __find_mci_by_dev(mci->pdev); if (unlikely(p != NULL)) goto fail0; @@ -640,26 +651,28 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci) * * If found, return a pointer to the structure. * Else return NULL. - * - * Caller must hold mem_ctls_mutex. 
*/ struct mem_ctl_info *edac_mc_find(int idx) { + struct mem_ctl_info *mci = NULL; struct list_head *item; - struct mem_ctl_info *mci; + + mutex_lock(&mem_ctls_mutex); list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); if (mci->mc_idx >= idx) { - if (mci->mc_idx == idx) - return mci; - + if (mci->mc_idx == idx) { + goto unlock; + } break; } } - return NULL; +unlock: + mutex_unlock(&mem_ctls_mutex); + return mci; } EXPORT_SYMBOL(edac_mc_find); @@ -779,7 +792,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev) mutex_lock(&mem_ctls_mutex); /* find the requested mci struct in the global list */ - mci = find_mci_by_dev(dev); + mci = __find_mci_by_dev(dev); if (mci == NULL) { mutex_unlock(&mem_ctls_mutex); return NULL; From 8176170e03db7289ca14673718f1a7f6aae51706 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 14 Nov 2016 23:11:04 +0000 Subject: [PATCH 06/26] EDAC, xgene: Fix spelling mistake in error messages Trivial fix to spelling mistake "Mutilple" to "Multiple" in error messages. 
Signed-off-by: Colin Ian King Reviewed-by: Loc Ho Cc: linux-edac Link: http://lkml.kernel.org/r/20161114231104.5585-1-colin.king@canonical.com Signed-off-by: Borislav Petkov --- drivers/edac/xgene_edac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index bf19b6e3bd12..5569391ea800 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -1602,16 +1602,16 @@ static void xgene_edac_pa_report(struct edac_device_ctl_info *edac_dev) dev_err(edac_dev->dev, "IOB PA read data RAM error\n"); if (reg & IOBPA_M_RDATA_CORRUPT_MASK) dev_err(edac_dev->dev, - "Mutilple IOB PA read data RAM error\n"); + "Multiple IOB PA read data RAM error\n"); if (reg & IOBPA_WDATA_CORRUPT_MASK) dev_err(edac_dev->dev, "IOB PA write data RAM error\n"); if (reg & IOBPA_M_WDATA_CORRUPT_MASK) dev_err(edac_dev->dev, - "Mutilple IOB PA write data RAM error\n"); + "Multiple IOB PA write data RAM error\n"); if (reg & IOBPA_TRANS_CORRUPT_MASK) dev_err(edac_dev->dev, "IOB PA transaction error\n"); if (reg & IOBPA_M_TRANS_CORRUPT_MASK) - dev_err(edac_dev->dev, "Mutilple IOB PA transaction error\n"); + dev_err(edac_dev->dev, "Multiple IOB PA transaction error\n"); if (reg & IOBPA_REQIDRAM_CORRUPT_MASK) dev_err(edac_dev->dev, "IOB PA transaction ID RAM error\n"); if (reg & IOBPA_M_REQIDRAM_CORRUPT_MASK) From 27bda205ba93c02d8b5dcd1d5c2acc84d889ca6a Mon Sep 17 00:00:00 2001 From: Yanjiang Jin Date: Thu, 17 Nov 2016 10:56:20 +0800 Subject: [PATCH 07/26] EDAC, mpc85xx: Implement remove method for the platform driver If we execute the below steps without this patch: modprobe mpc85xx_edac [The first insmod, everything is well.] modprobe -r mpc85xx_edac modprobe mpc85xx_edac [insmod again, error happens.] We would get the error messages as below: BUG: recent printk recursion! 
Oops: Kernel access of bad area, sig: 11 [#48] Modules linked in: mpc85xx_edac edac_core softdog [last unloaded: mpc85xx_edac] CPU: 5 PID: 14773 Comm: modprobe Tainted: G D C 4.8.3-rt2 .vsnprintf .vscnprintf .vprintk_emit .printk .edac_pci_add_device .mpc85xx_pci_err_probe .platform_drv_probe .driver_probe_device .__driver_attach .bus_for_each_dev .driver_attach .bus_add_driver .driver_register .__platform_register_drivers .mpc85xx_mc_init .do_one_initcall .do_init_module .load_module .SyS_finit_module system_call Address this by cleaning up properly when removing the platform driver. Tested on a T4240QDS board. Signed-off-by: Yanjiang Jin Acked-by: Johannes Thumshirn Cc: linux-edac Cc: york.sun@nxp.com Link: http://lkml.kernel.org/r/1479351380-17109-2-git-send-email-yanjiang.jin@windriver.com [ Boris: massage commit message. ] Signed-off-by: Borislav Petkov --- drivers/edac/mpc85xx_edac.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index ff0567526ee3..c62602141f95 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -300,6 +300,22 @@ err: return res; } +static int mpc85xx_pci_err_remove(struct platform_device *op) +{ + struct edac_pci_ctl_info *pci = dev_get_drvdata(&op->dev); + struct mpc85xx_pci_pdata *pdata = pci->pvt_info; + + edac_dbg(0, "\n"); + + out_be32(pdata->pci_vbase + MPC85XX_PCI_ERR_ADDR, orig_pci_err_cap_dr); + out_be32(pdata->pci_vbase + MPC85XX_PCI_ERR_EN, orig_pci_err_en); + + edac_pci_del_device(&op->dev); + edac_pci_free_ctl_info(pci); + + return 0; +} + static const struct platform_device_id mpc85xx_pci_err_match[] = { { .name = "mpc85xx-pci-edac" @@ -309,6 +325,7 @@ static const struct platform_device_id mpc85xx_pci_err_match[] = { static struct platform_driver mpc85xx_pci_err_driver = { .probe = mpc85xx_pci_err_probe, + .remove = mpc85xx_pci_err_remove, .id_table = mpc85xx_pci_err_match, .driver = { .name = "mpc85xx_pci_err", From 
1e8096bb2031c53b6bf3adc7667b4b2bdf2a1ac6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:28 -0500 Subject: [PATCH 08/26] EDAC: Add LRDDR4 DRAM type AMD Fam17h systems can support Load-Reduced DDR4 DIMMs. So add this new type to edac.h in preparation for the Fam17h EDAC update. Also, let's fix a format issue with the LRDDR3 line while we're here. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Link: http://lkml.kernel.org/r/1479423463-8536-3-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- include/linux/edac.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/edac.h b/include/linux/edac.h index 9e0d78966552..bf2bf87bb2f9 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -192,10 +192,11 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_DDR3: DDR3 RAM * @MEM_RDDR3: Registered DDR3 RAM * This is a variant of the DDR3 memories. - * @MEM_LRDDR3 Load-Reduced DDR3 memory. + * @MEM_LRDDR3: Load-Reduced DDR3 memory. * @MEM_DDR4: Unbuffered DDR4 RAM * @MEM_RDDR4: Registered DDR4 RAM * This is a variant of the DDR4 memories. + * @MEM_LRDDR4: Load-Reduced DDR4 memory. */ enum mem_type { MEM_EMPTY = 0, @@ -218,6 +219,7 @@ enum mem_type { MEM_LRDDR3, MEM_DDR4, MEM_RDDR4, + MEM_LRDDR4, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -239,6 +241,7 @@ enum mem_type { #define MEM_FLAG_RDDR3 BIT(MEM_RDDR3) #define MEM_FLAG_DDR4 BIT(MEM_DDR4) #define MEM_FLAG_RDDR4 BIT(MEM_RDDR4) +#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4) /** * enum edac-type - Error Detection and Correction capabilities and mode From 5c332202f805170436b210938fe3ad7f6b29bdbc Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:29 -0500 Subject: [PATCH 09/26] EDAC, mce_amd: Rename nb_bus_decoder to dram_ecc_decoder nb_bus_decoder() is only used for DRAM ECC errors so rename it so that the name is more generic and descriptive. 
Also, call it for DRAM ECC errors on SMCA systems. [ Boris: rename it to real function name with a verb in it. ] Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Link: http://lkml.kernel.org/r/1479423463-8536-4-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index daaac2c79ca7..ac11bccba809 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -8,7 +8,7 @@ static struct amd_decoder_ops *fam_ops; static u8 xec_mask = 0xf; static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct mce *m); +static void (*decode_dram_ecc)(int node_id, struct mce *m); void amd_report_gart_errors(bool v) { @@ -18,16 +18,16 @@ EXPORT_SYMBOL_GPL(amd_report_gart_errors); void amd_register_ecc_decoder(void (*f)(int, struct mce *)) { - nb_bus_decoder = f; + decode_dram_ecc = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)) { - if (nb_bus_decoder) { - WARN_ON(nb_bus_decoder != f); + if (decode_dram_ecc) { + WARN_ON(decode_dram_ecc != f); - nb_bus_decoder = NULL; + decode_dram_ecc = NULL; } } EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); @@ -763,8 +763,8 @@ static void decode_mc4_mce(struct mce *m) pr_cont("%s.\n", mc4_mce_desc[xec]); - if (nb_bus_decoder) - nb_bus_decoder(node_id, m); + if (decode_dram_ecc) + decode_dram_ecc(node_id, m); return; } break; @@ -877,6 +877,13 @@ static void decode_smca_errors(struct mce *m) pr_emerg(HW_ERR "%s Error: ", ip_name); pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]); } + + /* + * amd_get_nb_id() returns the last level cache id. + * The last level cache on Fam17h is 1 level below the node. 
+ */ + if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) + decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m); } static inline void amd_decode_err_code(u16 ec) From e7934b70d76557551b205449cba1af2153a48e42 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:30 -0500 Subject: [PATCH 10/26] EDAC, amd64: Change target of pci_name from F2 to F3 AMD Fam17h will not be using PCI function 2 for EDAC, but will continue to use function 3. So let's get the name of F3 instead of F2 to support Fam17h and previous families. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Link: http://lkml.kernel.org/r/1479423463-8536-5-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ee181c53626f..a38b82de0b6f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2703,7 +2703,7 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci, mci->mod_name = EDAC_MOD_STR; mci->mod_ver = EDAC_AMD64_VERSION; mci->ctl_name = fam->ctl_name; - mci->dev_name = pci_name(pvt->F2); + mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; /* memory scrubber interface */ From e70984d9ebb4b3d143bf33eaca39d4b3d1c352ff Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:31 -0500 Subject: [PATCH 11/26] EDAC, amd64: Rename __log_bus_error() to be more specific We only use __log_bus_error() to log DRAM ECC errors, so let's change the name to reflect this. We'll also use this function for DRAM ECC errors on Fam17h, but we'll call it from a different function than decode_bus_error(). 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Link: http://lkml.kernel.org/r/1479423463-8536-6-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index a38b82de0b6f..aaff0b9cdaa8 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2155,7 +2155,7 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome) return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz); } -static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err, +static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err, u8 ecc_type) { enum hw_event_mc_err_type err_type; @@ -2227,7 +2227,7 @@ static inline void decode_bus_error(int node_id, struct mce *m) pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err); - __log_bus_error(mci, &err, ecc_type); + __log_ecc_error(mci, &err, ecc_type); } /* From d12a969ebbfcfc25853c4147d42b388f758e8784 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:32 -0500 Subject: [PATCH 12/26] EDAC, amd64: Add Deferred Error type Currently, deferred errors are classified as correctable in EDAC. Add a new error type for deferred errors so that they are correctly reported to the user. 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Link: http://lkml.kernel.org/r/1479423463-8536-7-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 ++ include/linux/edac.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index aaff0b9cdaa8..dd45cff02388 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2165,6 +2165,8 @@ static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err, err_type = HW_EVENT_ERR_CORRECTED; else if (ecc_type == 1) err_type = HW_EVENT_ERR_UNCORRECTED; + else if (ecc_type == 3) + err_type = HW_EVENT_ERR_DEFERRED; else { WARN(1, "Something is rotten in the state of Denmark.\n"); return; diff --git a/include/linux/edac.h b/include/linux/edac.h index bf2bf87bb2f9..cb56dcba68c6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -134,6 +134,7 @@ enum dev_type { enum hw_event_mc_err_type { HW_EVENT_ERR_CORRECTED, HW_EVENT_ERR_UNCORRECTED, + HW_EVENT_ERR_DEFERRED, HW_EVENT_ERR_FATAL, HW_EVENT_ERR_INFO, }; @@ -145,6 +146,8 @@ static inline char *mc_event_error_type(const unsigned int err_type) return "Corrected"; case HW_EVENT_ERR_UNCORRECTED: return "Uncorrected"; + case HW_EVENT_ERR_DEFERRED: + return "Deferred"; case HW_EVENT_ERR_FATAL: return "Fatal"; default: From 044e7a414be9ba20826e5fd482214686193fe7b6 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 22 Nov 2016 15:40:16 -0600 Subject: [PATCH 13/26] EDAC, amd64: Don't force-enable ECC checking on newer systems It's not recommended for the OS to try and force-enable ECC checking. This is considered a firmware task since it includes memory training, etc, so don't change ECC settings on Fam17h or newer systems and inform the user. 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Link: http://lkml.kernel.org/r/1479850816-1595-1-git-send-email-Yazen.Ghannam@amd.com [ Put the "forcing" message in an else branch. ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index dd45cff02388..ca1d63aa4e59 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2629,7 +2629,6 @@ static void restore_ecc_error_reporting(struct ecc_settings *s, u16 nid, { u32 value, mask = 0x3; /* UECC/CECC enable */ - if (!s->nbctl_valid) return; @@ -2895,7 +2894,11 @@ static int probe_one_instance(unsigned int nid) if (!ecc_enable_override) goto err_enable; - amd64_warn("Forcing ECC on!\n"); + if (boot_cpu_data.x86 >= 0x17) { + amd64_warn("Forcing ECC on is not recommended on newer systems. Please enable ECC in BIOS."); + goto err_enable; + } else + amd64_warn("Forcing ECC on!\n"); if (!enable_ecc_error_reporting(s, nid, F3)) goto err_enable; @@ -2904,7 +2907,9 @@ static int probe_one_instance(unsigned int nid) ret = init_one_instance(nid); if (ret < 0) { amd64_err("Error probing instance: %d\n", nid); - restore_ecc_error_reporting(s, nid, F3); + + if (boot_cpu_data.x86 < 0x17) + restore_ecc_error_reporting(s, nid, F3); } return ret; From 196b79fcc8ed4e3c565a746b06125596bee06b62 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:34 -0500 Subject: [PATCH 14/26] EDAC, amd64: Extend ecc_enabled() to Fam17h Update the ecc_enabled() function to work on Fam17h. This entails reading a different set of registers and using the SMN (System Management Network) rather than PCI devices. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-9-git-send-email-Yazen.Ghannam@amd.com [ Fixup ecc_en assignment and get_umc_base(). 
] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 50 +++++++++++++++++++++++++++++++-------- drivers/edac/amd64_edac.h | 16 +++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index ca1d63aa4e59..870f56713c22 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2664,21 +2664,51 @@ static const char *ecc_msg = static bool ecc_enabled(struct pci_dev *F3, u16 nid) { - u32 value; - u8 ecc_en = 0; bool nb_mce_en = false; + u8 ecc_en = 0, i; + u32 value; - amd64_read_pci_cfg(F3, NBCFG, &value); + if (boot_cpu_data.x86 >= 0x17) { + u8 umc_en_mask = 0, ecc_en_mask = 0; + + for (i = 0; i < NUM_UMCS; i++) { + u32 base = get_umc_base(i); + + /* Only check enabled UMCs. */ + if (amd_smn_read(nid, base + UMCCH_SDP_CTRL, &value)) + continue; + + if (!(value & UMC_SDP_INIT)) + continue; + + umc_en_mask |= BIT(i); + + if (amd_smn_read(nid, base + UMCCH_UMC_CAP_HI, &value)) + continue; + + if (value & UMC_ECC_ENABLED) + ecc_en_mask |= BIT(i); + } + + /* Check whether at least one UMC is enabled: */ + if (umc_en_mask) + ecc_en = umc_en_mask == ecc_en_mask; + + /* Assume UMC MCA banks are enabled. */ + nb_mce_en = true; + } else { + amd64_read_pci_cfg(F3, NBCFG, &value); + + ecc_en = !!(value & NBCFG_ECC_ENABLE); + + nb_mce_en = nb_mce_bank_enabled_on_node(nid); + if (!nb_mce_en) + amd64_notice("NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", + MSR_IA32_MCG_CTL, nid); + } - ecc_en = !!(value & NBCFG_ECC_ENABLE); amd64_info("DRAM ECC %s.\n", (ecc_en ? 
"enabled" : "disabled")); - nb_mce_en = nb_mce_bank_enabled_on_node(nid); - if (!nb_mce_en) - amd64_notice("NB MCE bank disabled, set MSR " - "0x%08x[4] on node %d to enable.\n", - MSR_IA32_MCG_CTL, nid); - if (!ecc_en || !nb_mce_en) { amd64_notice("%s", ecc_msg); return false; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index c08870479054..96c1f5d6d130 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -248,6 +248,16 @@ /* MSRs */ #define MSR_MCGCTL_NBE BIT(4) +/* UMC CH register offsets */ +#define UMCCH_SDP_CTRL 0x104 +#define UMCCH_UMC_CAP_HI 0xDF4 + +/* UMC CH bitfields */ +#define UMC_ECC_ENABLED BIT(30) +#define UMC_SDP_INIT BIT(31) + +#define NUM_UMCS 2 + enum amd_families { K8_CPUS = 0, F10_CPUS, @@ -354,6 +364,12 @@ struct err_info { u32 offset; }; +static inline u32 get_umc_base(u8 channel) +{ + /* ch0: 0x50000, ch1: 0x150000 */ + return 0x50000 + (!!channel << 20); +} + static inline u64 get_dram_base(struct amd64_pvt *pvt, u8 i) { u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8; From f1cbbec9fce958d3d71ed815a01c815b35533f1f Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:35 -0500 Subject: [PATCH 15/26] EDAC, amd64: Add AMD Fam17h family type and ops Add a family type and associated ops for Fam17h. Define a struct to hold all the UMC registers that we need. Make this a part of struct amd64_pvt in order to maximize code reuse in the rest of the driver. 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-10-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 44 +++++++++++++++++++++++++++++++++++++++ drivers/edac/amd64_edac.h | 11 +++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 870f56713c22..bb70392bb115 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1210,6 +1210,19 @@ static int f1x_early_channel_count(struct amd64_pvt *pvt) return channels; } +static int f17_early_channel_count(struct amd64_pvt *pvt) +{ + int i, channels = 0; + + /* SDP Control bit 31 (SdpInit) is clear for unused UMC channels */ + for (i = 0; i < NUM_UMCS; i++) + channels += !!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT); + + amd64_info("MCT channel count: %d\n", channels); + + return channels; +} + static int ddr3_cs_size(unsigned i, bool dct_width) { unsigned shift = 0; @@ -1337,6 +1350,23 @@ static int f16_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct, return ddr3_cs_size(cs_mode, false); } +static int f17_base_addr_to_cs_size(struct amd64_pvt *pvt, u8 umc, + unsigned int cs_mode, int csrow_nr) +{ + u32 base_addr = pvt->csels[umc].csbases[csrow_nr]; + + /* Each mask is used for every two base addresses. */ + u32 addr_mask = pvt->csels[umc].csmasks[csrow_nr >> 1]; + + /* Register [31:1] = Address [39:9]. Size is in kBs here. */ + u32 size = ((addr_mask >> 1) - (base_addr >> 1) + 1) >> 1; + + edac_dbg(1, "BaseAddr: 0x%x, AddrMask: 0x%x\n", base_addr, addr_mask); + + /* Return size in MBs. 
*/ + return size >> 10; +} + static void read_dram_ctl_register(struct amd64_pvt *pvt) { @@ -1989,6 +2019,15 @@ static struct amd64_family_type family_types[] = { .dbam_to_cs = f16_dbam_to_chip_select, } }, + [F17_CPUS] = { + .ctl_name = "F17h", + .f0_id = PCI_DEVICE_ID_AMD_17H_DF_F0, + .f6_id = PCI_DEVICE_ID_AMD_17H_DF_F6, + .ops = { + .early_channel_count = f17_early_channel_count, + .dbam_to_cs = f17_base_addr_to_cs_size, + } + }, }; /* @@ -2790,6 +2829,11 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) pvt->ops = &family_types[F16_CPUS].ops; break; + case 0x17: + fam_type = &family_types[F17_CPUS]; + pvt->ops = &family_types[F17_CPUS].ops; + break; + default: amd64_err("Unsupported family!\n"); return NULL; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 96c1f5d6d130..ada39f165a9b 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -118,6 +118,8 @@ #define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581 #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 +#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 +#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466 /* * Function 1 - Address Map @@ -266,6 +268,7 @@ enum amd_families { F15_M60H_CPUS, F16_CPUS, F16_M30H_CPUS, + F17_CPUS, NUM_FAMILIES, }; @@ -298,6 +301,10 @@ struct chip_select { u8 m_cnt; }; +struct amd64_umc { + u32 sdp_ctrl; /* SDP Control reg */ +}; + struct amd64_pvt { struct low_ops *ops; @@ -345,6 +352,8 @@ struct amd64_pvt { /* cache the dram_type */ enum mem_type dram_type; + + struct amd64_umc *umc; /* UMC registers */ }; enum err_codes { @@ -438,7 +447,7 @@ struct low_ops { struct amd64_family_type { const char *ctl_name; - u16 f1_id, f2_id; + u16 f0_id, f1_id, f2_id, f6_id; struct low_ops ops; }; From 936fc3afaa8abc20dfea306c9b6d19a6e7ca5caf Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:36 -0500 Subject: [PATCH 16/26] EDAC, amd64: Reserve correct PCI devices on AMD Fam17h Fam17h 
needs PCI device functions 0 and 6 instead of 1 and 2 as on older systems. Update struct amd64_pvt to hold the new functions and reserve them if on Fam17h. Also, allocate an array of UMC structs within our newly allocated PVT struct. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-11-git-send-email-Yazen.Ghannam@amd.com [ init_one_instance() error handling, shorten lines, unbreak >80 cols lines. ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 85 +++++++++++++++++++++++++++++++-------- drivers/edac/amd64_edac.h | 2 +- 2 files changed, 69 insertions(+), 18 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index bb70392bb115..1f3de3402d48 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2274,30 +2274,55 @@ static inline void decode_bus_error(int node_id, struct mce *m) /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. + * Reserve F0 and F6 on systems with a UMC. 
*/ -static int reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 f1_id, u16 f2_id) +static int +reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) { + if (pvt->umc) { + pvt->F0 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); + if (!pvt->F0) { + amd64_err("error F0 device not found: vendor %x device 0x%x (broken BIOS?)\n", + PCI_VENDOR_ID_AMD, pci_id1); + return -ENODEV; + } + + pvt->F6 = pci_get_related_function(pvt->F3->vendor, pci_id2, pvt->F3); + if (!pvt->F6) { + pci_dev_put(pvt->F0); + pvt->F0 = NULL; + + amd64_err("error F6 device not found: vendor %x device 0x%x (broken BIOS?)\n", + PCI_VENDOR_ID_AMD, pci_id2); + + return -ENODEV; + } + edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); + edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); + edac_dbg(1, "F6: %s\n", pci_name(pvt->F6)); + + return 0; + } + /* Reserve the ADDRESS MAP Device */ - pvt->F1 = pci_get_related_function(pvt->F3->vendor, f1_id, pvt->F3); + pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); if (!pvt->F1) { - amd64_err("error address map device not found: " - "vendor %x device 0x%x (broken BIOS?)\n", - PCI_VENDOR_ID_AMD, f1_id); + amd64_err("error address map device not found: vendor %x device 0x%x (broken BIOS?)\n", + PCI_VENDOR_ID_AMD, pci_id1); return -ENODEV; } /* Reserve the DCT Device */ - pvt->F2 = pci_get_related_function(pvt->F3->vendor, f2_id, pvt->F3); + pvt->F2 = pci_get_related_function(pvt->F3->vendor, pci_id2, pvt->F3); if (!pvt->F2) { pci_dev_put(pvt->F1); pvt->F1 = NULL; - amd64_err("error F2 device not found: " - "vendor %x device 0x%x (broken BIOS?)\n", - PCI_VENDOR_ID_AMD, f2_id); - - return -ENODEV; + amd64_err("error F2 device not found: vendor %x device 0x%x (broken BIOS?)\n", + PCI_VENDOR_ID_AMD, pci_id2); + return -ENODEV; } + edac_dbg(1, "F1: %s\n", pci_name(pvt->F1)); edac_dbg(1, "F2: %s\n", pci_name(pvt->F2)); edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); @@ -2307,8 +2332,13 @@ static int reserve_mc_sibling_devs(struct 
amd64_pvt *pvt, u16 f1_id, u16 f2_id) static void free_mc_sibling_devs(struct amd64_pvt *pvt) { - pci_dev_put(pvt->F1); - pci_dev_put(pvt->F2); + if (pvt->umc) { + pci_dev_put(pvt->F0); + pci_dev_put(pvt->F6); + } else { + pci_dev_put(pvt->F1); + pci_dev_put(pvt->F2); + } } /* @@ -2864,6 +2894,7 @@ static int init_one_instance(unsigned int nid) struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; struct amd64_pvt *pvt = NULL; + u16 pci_id1, pci_id2; int err = 0, ret; ret = -ENOMEM; @@ -2879,10 +2910,23 @@ static int init_one_instance(unsigned int nid) if (!fam_type) goto err_free; - ret = -ENODEV; - err = reserve_mc_sibling_devs(pvt, fam_type->f1_id, fam_type->f2_id); + if (pvt->fam >= 0x17) { + pvt->umc = kcalloc(NUM_UMCS, sizeof(struct amd64_umc), GFP_KERNEL); + if (!pvt->umc) { + ret = -ENOMEM; + goto err_free; + } + + pci_id1 = fam_type->f0_id; + pci_id2 = fam_type->f6_id; + } else { + pci_id1 = fam_type->f1_id; + pci_id2 = fam_type->f2_id; + } + + err = reserve_mc_sibling_devs(pvt, pci_id1, pci_id2); if (err) - goto err_free; + goto err_post_init; read_mc_regs(pvt); @@ -2942,6 +2986,10 @@ err_add_mc: err_siblings: free_mc_sibling_devs(pvt); +err_post_init: + if (pvt->fam >= 0x17) + kfree(pvt->umc); + err_free: kfree(pvt); @@ -3044,7 +3092,10 @@ static void setup_pci_device(void) return; pvt = mci->pvt_info; - pci_ctl = edac_pci_create_generic_ctl(&pvt->F2->dev, EDAC_MOD_STR); + if (pvt->umc) + pci_ctl = edac_pci_create_generic_ctl(&pvt->F0->dev, EDAC_MOD_STR); + else + pci_ctl = edac_pci_create_generic_ctl(&pvt->F2->dev, EDAC_MOD_STR); if (!pci_ctl) { pr_warn("%s(): Unable to create PCI control\n", __func__); pr_warn("%s(): PCI error report via EDAC not set\n", __func__); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ada39f165a9b..4ca7d249f02e 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -309,7 +309,7 @@ struct amd64_pvt { struct low_ops *ops; /* pci_device handles which we utilize */ - struct 
pci_dev *F1, *F2, *F3; + struct pci_dev *F0, *F1, *F2, *F3, *F6; u16 mc_node_id; /* MC index of this MC node */ u8 fam; /* CPU family */ From b64ce7cd7f540c64e3fbeaeee3ddb59bc9ab1a3b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:37 -0500 Subject: [PATCH 17/26] EDAC, amd64: Read MC registers on AMD Fam17h Fam17h has a different set of registers and bitfields. Most of these registers are read through SMN (System Management Network) rather than PCI config space. Also, the derivation of various values is now different. Update amd64_edac to read the appropriate registers and extract the correct values for Fam17h. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-12-git-send-email-Yazen.Ghannam@amd.com [ Save us the indentation level in read_mc_regs(), add defines ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 172 +++++++++++++++++++++++++++++--------- drivers/edac/amd64_edac.h | 13 +++ 2 files changed, 146 insertions(+), 39 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 1f3de3402d48..461da1323f2f 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -789,46 +789,78 @@ static void prep_chip_selects(struct amd64_pvt *pvt) */ static void read_dct_base_mask(struct amd64_pvt *pvt) { - int cs; + int base_reg0, base_reg1, mask_reg0, mask_reg1, cs; prep_chip_selects(pvt); + if (pvt->umc) { + base_reg0 = get_umc_base(0) + UMCCH_BASE_ADDR; + base_reg1 = get_umc_base(1) + UMCCH_BASE_ADDR; + mask_reg0 = get_umc_base(0) + UMCCH_ADDR_MASK; + mask_reg1 = get_umc_base(1) + UMCCH_ADDR_MASK; + } else { + base_reg0 = DCSB0; + base_reg1 = DCSB1; + mask_reg0 = DCSM0; + mask_reg1 = DCSM1; + } + for_each_chip_select(cs, 0, pvt) { - int reg0 = DCSB0 + (cs * 4); - int reg1 = DCSB1 + (cs * 4); + int reg0 = base_reg0 + (cs * 4); + int reg1 = base_reg1 + (cs * 4); u32 *base0 = &pvt->csels[0].csbases[cs]; u32 *base1 = 
&pvt->csels[1].csbases[cs]; - if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, base0)) - edac_dbg(0, " DCSB0[%d]=0x%08x reg: F2x%x\n", - cs, *base0, reg0); + if (pvt->umc) { + if (!amd_smn_read(pvt->mc_node_id, reg0, base0)) + edac_dbg(0, " DCSB0[%d]=0x%08x reg: 0x%x\n", + cs, *base0, reg0); - if (pvt->fam == 0xf) - continue; + if (!amd_smn_read(pvt->mc_node_id, reg1, base1)) + edac_dbg(0, " DCSB1[%d]=0x%08x reg: 0x%x\n", + cs, *base1, reg1); + } else { + if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, base0)) + edac_dbg(0, " DCSB0[%d]=0x%08x reg: F2x%x\n", + cs, *base0, reg0); - if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, base1)) - edac_dbg(0, " DCSB1[%d]=0x%08x reg: F2x%x\n", - cs, *base1, (pvt->fam == 0x10) ? reg1 + if (pvt->fam == 0xf) + continue; + + if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, base1)) + edac_dbg(0, " DCSB1[%d]=0x%08x reg: F2x%x\n", + cs, *base1, (pvt->fam == 0x10) ? reg1 : reg0); + } } for_each_chip_select_mask(cs, 0, pvt) { - int reg0 = DCSM0 + (cs * 4); - int reg1 = DCSM1 + (cs * 4); + int reg0 = mask_reg0 + (cs * 4); + int reg1 = mask_reg1 + (cs * 4); u32 *mask0 = &pvt->csels[0].csmasks[cs]; u32 *mask1 = &pvt->csels[1].csmasks[cs]; - if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, mask0)) - edac_dbg(0, " DCSM0[%d]=0x%08x reg: F2x%x\n", - cs, *mask0, reg0); + if (pvt->umc) { + if (!amd_smn_read(pvt->mc_node_id, reg0, mask0)) + edac_dbg(0, " DCSM0[%d]=0x%08x reg: 0x%x\n", + cs, *mask0, reg0); - if (pvt->fam == 0xf) - continue; + if (!amd_smn_read(pvt->mc_node_id, reg1, mask1)) + edac_dbg(0, " DCSM1[%d]=0x%08x reg: 0x%x\n", + cs, *mask1, reg1); + } else { + if (!amd64_read_dct_pci_cfg(pvt, 0, reg0, mask0)) + edac_dbg(0, " DCSM0[%d]=0x%08x reg: F2x%x\n", + cs, *mask0, reg0); - if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, mask1)) - edac_dbg(0, " DCSM1[%d]=0x%08x reg: F2x%x\n", - cs, *mask1, (pvt->fam == 0x10) ? 
reg1 + if (pvt->fam == 0xf) + continue; + + if (!amd64_read_dct_pci_cfg(pvt, 1, reg0, mask1)) + edac_dbg(0, " DCSM1[%d]=0x%08x reg: F2x%x\n", + cs, *mask1, (pvt->fam == 0x10) ? reg1 : reg0); + } } } @@ -881,6 +913,15 @@ static void determine_memory_type(struct amd64_pvt *pvt) case 0x16: goto ddr3; + case 0x17: + if ((pvt->umc[0].dimm_cfg | pvt->umc[1].dimm_cfg) & BIT(5)) + pvt->dram_type = MEM_LRDDR4; + else if ((pvt->umc[0].dimm_cfg | pvt->umc[1].dimm_cfg) & BIT(4)) + pvt->dram_type = MEM_RDDR4; + else + pvt->dram_type = MEM_DDR4; + return; + default: WARN(1, KERN_ERR "%s: Family??? 0x%x\n", __func__, pvt->fam); pvt->dram_type = MEM_EMPTY; @@ -2341,30 +2382,91 @@ static void free_mc_sibling_devs(struct amd64_pvt *pvt) } } +static void determine_ecc_sym_sz(struct amd64_pvt *pvt) +{ + pvt->ecc_sym_sz = 4; + + if (pvt->umc) { + u8 i; + + for (i = 0; i < NUM_UMCS; i++) { + /* Check enabled channels only: */ + if ((pvt->umc[i].sdp_ctrl & UMC_SDP_INIT) && + (pvt->umc[i].ecc_ctrl & BIT(7))) { + pvt->ecc_sym_sz = 8; + break; + } + } + + return; + } + + if (pvt->fam >= 0x10) { + u32 tmp; + + amd64_read_pci_cfg(pvt->F3, EXT_NB_MCA_CFG, &tmp); + /* F16h has only DCT0, so no need to read dbam1. */ + if (pvt->fam != 0x16) + amd64_read_dct_pci_cfg(pvt, 1, DBAM0, &pvt->dbam1); + + /* F10h, revD and later can do x8 ECC too. */ + if ((pvt->fam > 0x10 || pvt->model > 7) && tmp & BIT(25)) + pvt->ecc_sym_sz = 8; + } +} + +/* + * Retrieve the hardware registers of the memory controller. 
+ */ +static void __read_mc_regs_df(struct amd64_pvt *pvt) +{ + u8 nid = pvt->mc_node_id; + struct amd64_umc *umc; + u32 i, umc_base; + + /* Read registers from each UMC */ + for (i = 0; i < NUM_UMCS; i++) { + + umc_base = get_umc_base(i); + umc = &pvt->umc[i]; + + amd_smn_read(nid, umc_base + UMCCH_SDP_CTRL, &umc->sdp_ctrl); + amd_smn_read(nid, umc_base + UMCCH_ECC_CTRL, &umc->ecc_ctrl); + amd_smn_read(nid, umc_base + UMCCH_DIMM_CFG, &umc->dimm_cfg); + } +} + /* * Retrieve the hardware registers of the memory controller (this includes the * 'Address Map' and 'Misc' device regs) */ static void read_mc_regs(struct amd64_pvt *pvt) { - unsigned range; + unsigned int range; u64 msr_val; - u32 tmp; /* * Retrieve TOP_MEM and TOP_MEM2; no masking off of reserved bits since - * those are Read-As-Zero + * those are Read-As-Zero. */ rdmsrl(MSR_K8_TOP_MEM1, pvt->top_mem); edac_dbg(0, " TOP_MEM: 0x%016llx\n", pvt->top_mem); - /* check first whether TOP_MEM2 is enabled */ + /* Check first whether TOP_MEM2 is enabled: */ rdmsrl(MSR_K8_SYSCFG, msr_val); - if (msr_val & (1U << 21)) { + if (msr_val & BIT(21)) { rdmsrl(MSR_K8_TOP_MEM2, pvt->top_mem2); edac_dbg(0, " TOP_MEM2: 0x%016llx\n", pvt->top_mem2); - } else + } else { edac_dbg(0, " TOP_MEM2 disabled\n"); + } + + if (pvt->umc) { + __read_mc_regs_df(pvt); + amd64_read_pci_cfg(pvt->F0, DF_DHAR, &pvt->dhar); + + goto skip; + } amd64_read_pci_cfg(pvt->F3, NBCAP, &pvt->nbcap); @@ -2393,8 +2495,6 @@ static void read_mc_regs(struct amd64_pvt *pvt) dram_dst_node(pvt, range)); } - read_dct_base_mask(pvt); - amd64_read_pci_cfg(pvt->F1, DHAR, &pvt->dhar); amd64_read_dct_pci_cfg(pvt, 0, DBAM0, &pvt->dbam0); @@ -2408,20 +2508,14 @@ static void read_mc_regs(struct amd64_pvt *pvt) amd64_read_dct_pci_cfg(pvt, 1, DCHR0, &pvt->dchr1); } - pvt->ecc_sym_sz = 4; +skip: + read_dct_base_mask(pvt); + determine_memory_type(pvt); edac_dbg(1, " DIMM type: %s\n", edac_mem_types[pvt->dram_type]); - if (pvt->fam >= 0x10) { - amd64_read_pci_cfg(pvt->F3, 
EXT_NB_MCA_CFG, &tmp); - /* F16h has only DCT0, so no need to read dbam1 */ - if (pvt->fam != 0x16) - amd64_read_dct_pci_cfg(pvt, 1, DBAM0, &pvt->dbam1); + determine_ecc_sym_sz(pvt); - /* F10h, revD and later can do x8 ECC too */ - if ((pvt->fam > 0x10 || pvt->model > 7) && tmp & BIT(25)) - pvt->ecc_sym_sz = 8; - } dump_misc_regs(pvt); } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 4ca7d249f02e..738166393673 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -250,12 +250,23 @@ /* MSRs */ #define MSR_MCGCTL_NBE BIT(4) +/* F17h */ + +/* F0: */ +#define DF_DHAR 0x104 + /* UMC CH register offsets */ +#define UMCCH_BASE_ADDR 0x0 +#define UMCCH_ADDR_MASK 0x20 +#define UMCCH_DIMM_CFG 0x80 #define UMCCH_SDP_CTRL 0x104 +#define UMCCH_ECC_CTRL 0x14C #define UMCCH_UMC_CAP_HI 0xDF4 /* UMC CH bitfields */ +#define UMC_ECC_CHIPKILL_CAP BIT(31) #define UMC_ECC_ENABLED BIT(30) + #define UMC_SDP_INIT BIT(31) #define NUM_UMCS 2 @@ -302,7 +313,9 @@ struct chip_select { }; struct amd64_umc { + u32 dimm_cfg; /* DIMM Configuration reg */ u32 sdp_ctrl; /* SDP Control reg */ + u32 ecc_ctrl; /* DRAM ECC Control reg */ }; struct amd64_pvt { From a6c14dce85e2d0472c4c7c1694034560d1772bc2 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 18 Nov 2016 09:10:22 -0500 Subject: [PATCH 18/26] EDAC, mce_amd: Don't report poison bit on Fam15h, bank 4 MCA_STATUS[43] has been defined as "Poison" or "Reserved" for every bank since Fam15h except for Fam15h, bank 4 in which case it's defined as part of the McaStatSubCache bitfield. Filter out that case. Reported-by: Dean Liberty Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479478222-19896-1-git-send-email-Yazen.Ghannam@amd.com [ Split an almost unparseable ternary conditional, add a comment. 
] Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index ba2995c2cc23..34208f38c5b1 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -964,10 +964,13 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 >= 0x15) - pr_cont("|%s|%s", - ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), - ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (c->x86 >= 0x15) { + pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); + + /* F15h, bank4, bit 43 is part of McaStatSubCache. */ + if (c->x86 != 0x15 || m->bank != 4) + pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); + } if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; From 8051c0af3c846937d5454766fe407b08a7681256 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:42 -0500 Subject: [PATCH 19/26] EDAC, amd64: Add Fam17h scrubber support Fam17h has new register offsets and fields for setting up the DRAM scrubber so add support for this. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-17-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 43 ++++++++++++++++++++++++++++++++++----- drivers/edac/amd64_edac.h | 2 ++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 461da1323f2f..52ae415aa699 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -164,8 +164,23 @@ static inline int amd64_read_dct_pci_cfg(struct amd64_pvt *pvt, u8 dct, * other archs, we might not have access to the caches directly. 
*/ +static inline void __f17h_set_scrubval(struct amd64_pvt *pvt, u32 scrubval) +{ + /* + * Fam17h supports scrub values between 0x5 and 0x14. Also, the values + * are shifted down by 0x5, so scrubval 0x5 is written to the register + * as 0x0, scrubval 0x6 as 0x1, etc. + */ + if (scrubval >= 0x5 && scrubval <= 0x14) { + scrubval -= 0x5; + pci_write_bits32(pvt->F6, F17H_SCR_LIMIT_ADDR, scrubval, 0xF); + pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 1, 0x1); + } else { + pci_write_bits32(pvt->F6, F17H_SCR_BASE_ADDR, 0, 0x1); + } +} /* - * scan the scrub rate mapping table for a close or matching bandwidth value to + * Scan the scrub rate mapping table for a close or matching bandwidth value to * issue. If requested is too big, then use last maximum value found. */ static int __set_scrub_rate(struct amd64_pvt *pvt, u32 new_bw, u32 min_rate) @@ -196,7 +211,9 @@ static int __set_scrub_rate(struct amd64_pvt *pvt, u32 new_bw, u32 min_rate) scrubval = scrubrates[i].scrubval; - if (pvt->fam == 0x15 && pvt->model == 0x60) { + if (pvt->fam == 0x17) { + __f17h_set_scrubval(pvt, scrubval); + } else if (pvt->fam == 0x15 && pvt->model == 0x60) { f15h_select_dct(pvt, 0); pci_write_bits32(pvt->F2, F15H_M60H_SCRCTRL, scrubval, 0x001F); f15h_select_dct(pvt, 1); @@ -233,18 +250,34 @@ static int set_scrub_rate(struct mem_ctl_info *mci, u32 bw) static int get_scrub_rate(struct mem_ctl_info *mci) { struct amd64_pvt *pvt = mci->pvt_info; - u32 scrubval = 0; int i, retval = -EINVAL; + u32 scrubval = 0; - if (pvt->fam == 0x15) { + switch (pvt->fam) { + case 0x15: /* Erratum #505 */ if (pvt->model < 0x10) f15h_select_dct(pvt, 0); if (pvt->model == 0x60) amd64_read_pci_cfg(pvt->F2, F15H_M60H_SCRCTRL, &scrubval); - } else + break; + + case 0x17: + amd64_read_pci_cfg(pvt->F6, F17H_SCR_BASE_ADDR, &scrubval); + if (scrubval & BIT(0)) { + amd64_read_pci_cfg(pvt->F6, F17H_SCR_LIMIT_ADDR, &scrubval); + scrubval &= 0xF; + scrubval += 0x5; + } else { + scrubval = 0; + } + break; + + default: 
amd64_read_pci_cfg(pvt->F3, SCRCTRL, &scrubval); + break; + } scrubval = scrubval & 0x001F; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 738166393673..34d915782d4a 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -204,6 +204,8 @@ #define DCT_SEL_HI 0x114 #define F15H_M60H_SCRCTRL 0x1C8 +#define F17H_SCR_BASE_ADDR 0x48 +#define F17H_SCR_LIMIT_ADDR 0x4C /* * Function 3 - Misc Control From 07ed82ef93d6c70dcd1f31429a8fd12fbdeb21fd Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 28 Nov 2016 08:50:21 -0600 Subject: [PATCH 20/26] EDAC, amd64: Add Fam17h debug output Read a few more UMC registers and provide debug output in order to be as similar as possible to older AMD systems. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1480344621-14966-1-git-send-email-Yazen.Ghannam@amd.com [ Remove unneeded K8 check and comments, fixup others. ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 96 +++++++++++++++++++++++++++++++++++---- drivers/edac/amd64_edac.h | 6 +++ 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 52ae415aa699..fdd963794cdb 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -762,8 +762,75 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) (dclr & BIT(15)) ? "yes" : "no"); } +static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) +{ + u32 *dcsb = ctrl ? 
pvt->csels[1].csbases : pvt->csels[0].csbases; + int dimm, size0, size1; + + edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl); + + for (dimm = 0; dimm < 4; dimm++) { + size0 = 0; + + if (dcsb[dimm*2] & DCSB_CS_ENABLE) + size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm); + + size1 = 0; + if (dcsb[dimm*2 + 1] & DCSB_CS_ENABLE) + size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm); + + amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", + dimm * 2, size0, + dimm * 2 + 1, size1); + } +} + +static void __dump_misc_regs_df(struct amd64_pvt *pvt) +{ + struct amd64_umc *umc; + u32 i, tmp, umc_base; + + for (i = 0; i < NUM_UMCS; i++) { + umc_base = get_umc_base(i); + umc = &pvt->umc[i]; + + edac_dbg(1, "UMC%d DIMM cfg: 0x%x\n", i, umc->dimm_cfg); + edac_dbg(1, "UMC%d UMC cfg: 0x%x\n", i, umc->umc_cfg); + edac_dbg(1, "UMC%d SDP ctrl: 0x%x\n", i, umc->sdp_ctrl); + edac_dbg(1, "UMC%d ECC ctrl: 0x%x\n", i, umc->ecc_ctrl); + + amd_smn_read(pvt->mc_node_id, umc_base + UMCCH_ECC_BAD_SYMBOL, &tmp); + edac_dbg(1, "UMC%d ECC bad symbol: 0x%x\n", i, tmp); + + amd_smn_read(pvt->mc_node_id, umc_base + UMCCH_UMC_CAP, &tmp); + edac_dbg(1, "UMC%d UMC cap: 0x%x\n", i, tmp); + edac_dbg(1, "UMC%d UMC cap high: 0x%x\n", i, umc->umc_cap_hi); + + edac_dbg(1, "UMC%d ECC capable: %s, ChipKill ECC capable: %s\n", + i, (umc->umc_cap_hi & BIT(30)) ? "yes" : "no", + (umc->umc_cap_hi & BIT(31)) ? "yes" : "no"); + edac_dbg(1, "UMC%d All DIMMs support ECC: %s\n", + i, (umc->umc_cfg & BIT(12)) ? "yes" : "no"); + edac_dbg(1, "UMC%d x4 DIMMs present: %s\n", + i, (umc->dimm_cfg & BIT(6)) ? "yes" : "no"); + edac_dbg(1, "UMC%d x16 DIMMs present: %s\n", + i, (umc->dimm_cfg & BIT(7)) ? 
"yes" : "no"); + + if (pvt->dram_type == MEM_LRDDR4) { + amd_smn_read(pvt->mc_node_id, umc_base + UMCCH_ADDR_CFG, &tmp); + edac_dbg(1, "UMC%d LRDIMM %dx rank multiply\n", + i, 1 << ((tmp >> 4) & 0x3)); + } + + debug_display_dimm_sizes_df(pvt, i); + } + + edac_dbg(1, "F0x104 (DRAM Hole Address): 0x%08x, base: 0x%08x\n", + pvt->dhar, dhar_base(pvt)); +} + /* Display and decode various NB registers for debug purposes. */ -static void dump_misc_regs(struct amd64_pvt *pvt) +static void __dump_misc_regs(struct amd64_pvt *pvt) { edac_dbg(1, "F3xE8 (NB Cap): 0x%08x\n", pvt->nbcap); @@ -783,8 +850,6 @@ static void dump_misc_regs(struct amd64_pvt *pvt) (pvt->fam == 0xf) ? k8_dhar_offset(pvt) : f10_dhar_offset(pvt)); - edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); - debug_display_dimm_sizes(pvt, 0); /* everything below this point is Fam10h and above */ @@ -793,13 +858,25 @@ static void dump_misc_regs(struct amd64_pvt *pvt) debug_display_dimm_sizes(pvt, 1); - amd64_info("using %s syndromes.\n", ((pvt->ecc_sym_sz == 8) ? "x8" : "x4")); - /* Only if NOT ganged does dclr1 have valid info */ if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); } +/* Display and decode various NB registers for debug purposes. */ +static void dump_misc_regs(struct amd64_pvt *pvt) +{ + if (pvt->umc) + __dump_misc_regs_df(pvt); + else + __dump_misc_regs(pvt); + + edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); + + amd64_info("using %s syndromes.\n", + ((pvt->ecc_sym_sz == 8) ? "x8" : "x4")); +} + /* * See BKDG, F2x[1,0][5C:40], F2[1,0][6C:60] */ @@ -2001,8 +2078,9 @@ static void debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) size0 = 0; if (dcsb[dimm*2] & DCSB_CS_ENABLE) - /* For f15m60h, need multiplier for LRDIMM cs_size - * calculation. We pass 'dimm' value to the dbam_to_cs + /* + * For F15m60h, we need multiplier for LRDIMM cs_size + * calculation. 
We pass dimm value to the dbam_to_cs * mapper so we can find the multiplier from the * corresponding DCSM. */ @@ -2463,9 +2541,11 @@ static void __read_mc_regs_df(struct amd64_pvt *pvt) umc_base = get_umc_base(i); umc = &pvt->umc[i]; + amd_smn_read(nid, umc_base + UMCCH_DIMM_CFG, &umc->dimm_cfg); + amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg); amd_smn_read(nid, umc_base + UMCCH_SDP_CTRL, &umc->sdp_ctrl); amd_smn_read(nid, umc_base + UMCCH_ECC_CTRL, &umc->ecc_ctrl); - amd_smn_read(nid, umc_base + UMCCH_DIMM_CFG, &umc->dimm_cfg); + amd_smn_read(nid, umc_base + UMCCH_UMC_CAP_HI, &umc->umc_cap_hi); } } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 34d915782d4a..cb91d0b06d23 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -260,9 +260,13 @@ /* UMC CH register offsets */ #define UMCCH_BASE_ADDR 0x0 #define UMCCH_ADDR_MASK 0x20 +#define UMCCH_ADDR_CFG 0x30 #define UMCCH_DIMM_CFG 0x80 +#define UMCCH_UMC_CFG 0x100 #define UMCCH_SDP_CTRL 0x104 #define UMCCH_ECC_CTRL 0x14C +#define UMCCH_ECC_BAD_SYMBOL 0xD90 +#define UMCCH_UMC_CAP 0xDF0 #define UMCCH_UMC_CAP_HI 0xDF4 /* UMC CH bitfields */ @@ -316,8 +320,10 @@ struct chip_select { struct amd64_umc { u32 dimm_cfg; /* DIMM Configuration reg */ + u32 umc_cfg; /* Configuration reg */ u32 sdp_ctrl; /* SDP Control reg */ u32 ecc_ctrl; /* DRAM ECC Control reg */ + u32 umc_cap_hi; /* Capabilities High reg */ }; struct amd64_pvt { From 2d09d8f301f53cb92e7ae7183d58a74fc55f85c0 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 29 Nov 2016 08:51:56 -0600 Subject: [PATCH 21/26] EDAC, amd64: Determine EDAC MC capabilities on Fam17h The UMCs on Fam17h are independent memory controllers so we need to read the capabilities from all UMCs and make sure they agree. Once we determine what capabilities are available we should save them for convenience. 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1480431116-94683-1-git-send-email-Yazen.Ghannam@amd.com [ Simplify f17h_determine_edac_ctl_cap(), preinit edac_mode in init_csrows(). ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 67 ++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 18 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index fdd963794cdb..9f9d2bc1868c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2698,20 +2698,22 @@ static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) static int init_csrows(struct mem_ctl_info *mci) { struct amd64_pvt *pvt = mci->pvt_info; + enum edac_type edac_mode = EDAC_NONE; struct csrow_info *csrow; struct dimm_info *dimm; - enum edac_type edac_mode; int i, j, empty = 1; int nr_pages = 0; u32 val; - amd64_read_pci_cfg(pvt->F3, NBCFG, &val); + if (!pvt->umc) { + amd64_read_pci_cfg(pvt->F3, NBCFG, &val); - pvt->nbcfg = val; + pvt->nbcfg = val; - edac_dbg(0, "node %d, NBCFG=0x%08x[ChipKillEccCap: %d|DramEccEn: %d]\n", - pvt->mc_node_id, val, - !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE)); + edac_dbg(0, "node %d, NBCFG=0x%08x[ChipKillEccCap: %d|DramEccEn: %d]\n", + pvt->mc_node_id, val, + !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE)); + } /* * We iterate over DCT0 here but we look at DCT1 in parallel, if needed. @@ -2747,14 +2749,18 @@ static int init_csrows(struct mem_ctl_info *mci) edac_dbg(1, "Total csrow%d pages: %u\n", i, nr_pages); - /* - * determine whether CHIPKILL or JUST ECC or NO ECC is operating - */ - if (pvt->nbcfg & NBCFG_ECC_ENABLE) - edac_mode = (pvt->nbcfg & NBCFG_CHIPKILL) ? 
- EDAC_S4ECD4ED : EDAC_SECDED; - else - edac_mode = EDAC_NONE; + /* Determine DIMM ECC mode: */ + if (pvt->umc) { + if (mci->edac_ctl_cap & EDAC_FLAG_S4ECD4ED) + edac_mode = EDAC_S4ECD4ED; + else if (mci->edac_ctl_cap & EDAC_FLAG_SECDED) + edac_mode = EDAC_SECDED; + + } else if (pvt->nbcfg & NBCFG_ECC_ENABLE) { + edac_mode = (pvt->nbcfg & NBCFG_CHIPKILL) + ? EDAC_S4ECD4ED + : EDAC_SECDED; + } for (j = 0; j < pvt->channel_count; j++) { dimm = csrow->channels[j]->dimm; @@ -2992,6 +2998,27 @@ static bool ecc_enabled(struct pci_dev *F3, u16 nid) return true; } +static inline void +f17h_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt) +{ + u8 i, ecc_en = 1, cpk_en = 1; + + for (i = 0; i < NUM_UMCS; i++) { + if (pvt->umc[i].sdp_ctrl & UMC_SDP_INIT) { + ecc_en &= !!(pvt->umc[i].umc_cap_hi & UMC_ECC_ENABLED); + cpk_en &= !!(pvt->umc[i].umc_cap_hi & UMC_ECC_CHIPKILL_CAP); + } + } + + /* Set chipkill only if ECC is enabled: */ + if (ecc_en) { + mci->edac_ctl_cap |= EDAC_FLAG_SECDED; + + if (cpk_en) + mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED; + } +} + static void setup_mci_misc_attrs(struct mem_ctl_info *mci, struct amd64_family_type *fam) { @@ -3000,11 +3027,15 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci, mci->mtype_cap = MEM_FLAG_DDR2 | MEM_FLAG_RDDR2; mci->edac_ctl_cap = EDAC_FLAG_NONE; - if (pvt->nbcap & NBCAP_SECDED) - mci->edac_ctl_cap |= EDAC_FLAG_SECDED; + if (pvt->umc) { + f17h_determine_edac_ctl_cap(mci, pvt); + } else { + if (pvt->nbcap & NBCAP_SECDED) + mci->edac_ctl_cap |= EDAC_FLAG_SECDED; - if (pvt->nbcap & NBCAP_CHIPKILL) - mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED; + if (pvt->nbcap & NBCAP_CHIPKILL) + mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED; + } mci->edac_cap = determine_edac_cap(pvt); mci->mod_name = EDAC_MOD_STR; From d27f3a348e3677b7d5ee6954ebafce679b011164 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:40 -0500 Subject: [PATCH 22/26] EDAC, amd64: Determine EDAC capabilities on Fam17h systems 
We need to determine the EDAC capabilities from all UMCs on the node. We should only check UMCs that are enabled and make sure they all agree. Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-15-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9f9d2bc1868c..d7bd96c83b51 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -715,15 +715,33 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16); */ static unsigned long determine_edac_cap(struct amd64_pvt *pvt) { - u8 bit; unsigned long edac_cap = EDAC_FLAG_NONE; + u8 bit; - bit = (pvt->fam > 0xf || pvt->ext_model >= K8_REV_F) - ? 19 - : 17; + if (pvt->umc) { + u8 i, umc_en_mask = 0, dimm_ecc_en_mask = 0; - if (pvt->dclr0 & BIT(bit)) - edac_cap = EDAC_FLAG_SECDED; + for (i = 0; i < NUM_UMCS; i++) { + if (!(pvt->umc[i].sdp_ctrl & UMC_SDP_INIT)) + continue; + + umc_en_mask |= BIT(i); + + /* UMC Configuration bit 12 (DimmEccEn) */ + if (pvt->umc[i].umc_cfg & BIT(12)) + dimm_ecc_en_mask |= BIT(i); + } + + if (umc_en_mask == dimm_ecc_en_mask) + edac_cap = EDAC_FLAG_SECDED; + } else { + bit = (pvt->fam > 0xf || pvt->ext_model >= K8_REV_F) + ? 19 + : 17; + + if (pvt->dclr0 & BIT(bit)) + edac_cap = EDAC_FLAG_SECDED; + } return edac_cap; } From 713ad54675fdfd7358dbcae21ab4788a014c6e23 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 28 Nov 2016 12:59:53 -0600 Subject: [PATCH 23/26] EDAC, amd64: Define and register UMC error decode function How we need to decode UMC errors is different from how we decode bus errors, so let's define a new function for this. We also need a way to determine the UMC channel since we're not guaranteed that there is a fixed relation between channel and MCA bank. 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1480359593-80369-1-git-send-email-Yazen.Ghannam@amd.com [ Fold in decode_synd_reg(), simplify. ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 89 +++++++++++++++++++++++++++++++++++++-- drivers/edac/amd64_edac.h | 2 + 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index d7bd96c83b51..48a38ab363dd 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2392,7 +2392,13 @@ static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err, string = "Failed to map error addr to a csrow"; break; case ERR_CHANNEL: - string = "unknown syndrome - possible error reporting race"; + string = "Unknown syndrome - possible error reporting race"; + break; + case ERR_SYND: + string = "MCA_SYND not valid - unknown syndrome and csrow"; + break; + case ERR_NORM_ADDR: + string = "Cannot decode normalized address"; break; default: string = "WTF error"; @@ -2441,6 +2447,76 @@ static inline void decode_bus_error(int node_id, struct mce *m) __log_ecc_error(mci, &err, ecc_type); } +/* + * To find the UMC channel represented by this bank we need to match on its + * instance_id. The instance_id of a bank is held in the lower 32 bits of its + * IPID. 
+ */ +static int find_umc_channel(struct amd64_pvt *pvt, struct mce *m) +{ + u32 umc_instance_id[] = {0x50f00, 0x150f00}; + u32 instance_id = m->ipid & GENMASK(31, 0); + int i, channel = -1; + + for (i = 0; i < ARRAY_SIZE(umc_instance_id); i++) + if (umc_instance_id[i] == instance_id) + channel = i; + + return channel; +} + +static void decode_umc_error(int node_id, struct mce *m) +{ + u8 ecc_type = (m->status >> 45) & 0x3; + struct mem_ctl_info *mci; + struct amd64_pvt *pvt; + struct err_info err; + u64 sys_addr; + + mci = edac_mc_find(node_id); + if (!mci) + return; + + pvt = mci->pvt_info; + + memset(&err, 0, sizeof(err)); + + if (m->status & MCI_STATUS_DEFERRED) + ecc_type = 3; + + err.channel = find_umc_channel(pvt, m); + if (err.channel < 0) { + err.err_code = ERR_CHANNEL; + goto log_error; + } + + if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) { + err.err_code = ERR_NORM_ADDR; + goto log_error; + } + + error_address_to_page_and_offset(sys_addr, &err); + + if (!(m->status & MCI_STATUS_SYNDV)) { + err.err_code = ERR_SYND; + goto log_error; + } + + if (ecc_type == 2) { + u8 length = (m->synd >> 18) & 0x3f; + + if (length) + err.syndrome = (m->synd >> 32) & GENMASK(length - 1, 0); + else + err.err_code = ERR_CHANNEL; + } + + err.csrow = m->synd & 0x7; + +log_error: + __log_ecc_error(mci, &err, ecc_type); +} + /* * Use pvt->F3 which contains the F3 CPU PCI device to get the related * F1 (AddrMap) and F2 (Dct) devices. Return negative value on error. 
@@ -3232,7 +3308,10 @@ static int init_one_instance(unsigned int nid) if (report_gart_errors) amd_report_gart_errors(true); - amd_register_ecc_decoder(decode_bus_error); + if (pvt->umc) + amd_register_ecc_decoder(decode_umc_error); + else + amd_register_ecc_decoder(decode_bus_error); return 0; @@ -3323,7 +3402,11 @@ static void remove_one_instance(unsigned int nid) /* unregister from EDAC MCE */ amd_report_gart_errors(false); - amd_unregister_ecc_decoder(decode_bus_error); + + if (pvt->umc) + amd_unregister_ecc_decoder(decode_umc_error); + else + amd_unregister_ecc_decoder(decode_bus_error); kfree(ecc_stngs[nid]); ecc_stngs[nid] = NULL; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index cb91d0b06d23..c3b004a53eea 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -382,6 +382,8 @@ enum err_codes { ERR_NODE = -1, ERR_CSROW = -2, ERR_CHANNEL = -3, + ERR_SYND = -4, + ERR_NORM_ADDR = -5, }; struct err_info { From 95d3af6bd18f381b5b1c62f117ce7f152a5db3ea Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 17 Nov 2016 17:57:43 -0500 Subject: [PATCH 24/26] EDAC, amd64: Autoload amd64_edac_mod on Fam17h systems Add Fam17h to the list of families to autoload amd64_edac_mod. 
Signed-off-by: Yazen Ghannam Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: x86-ml Link: http://lkml.kernel.org/r/1479423463-8536-18-git-send-email-Yazen.Ghannam@amd.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 48a38ab363dd..df6d650a8e47 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3446,6 +3446,7 @@ static const struct x86_cpu_id amd64_cpuids[] = { { X86_VENDOR_AMD, 0x10, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, { X86_VENDOR_AMD, 0x15, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, { X86_VENDOR_AMD, 0x16, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, + { X86_VENDOR_AMD, 0x17, X86_MODEL_ANY, X86_FEATURE_ANY, 0 }, { } }; MODULE_DEVICE_TABLE(x86cpu, amd64_cpuids); From 5246c540073fb3d6f9aae87215f297e4895e89b3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 1 Dec 2016 11:35:07 +0100 Subject: [PATCH 25/26] EDAC, amd64: Improve amd64-specific printing macros Prefix the warn and error macros with the respective string so that callers don't have to say "Error" or "Warning". This saves string length in the actual calls. While at it, shorten the calls in reserve_mc_sibling_devs(). 
Signed-off-by: Borislav Petkov Cc: Dan Carpenter Cc: Yazen Ghannam --- drivers/edac/amd64_edac.c | 16 ++++++---------- drivers/edac/amd64_edac.h | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index df6d650a8e47..9a7cf3c8df06 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2528,8 +2528,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) if (pvt->umc) { pvt->F0 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); if (!pvt->F0) { - amd64_err("error F0 device not found: vendor %x device 0x%x (broken BIOS?)\n", - PCI_VENDOR_ID_AMD, pci_id1); + amd64_err("F0 not found, device 0x%x (broken BIOS?)\n", pci_id1); return -ENODEV; } @@ -2538,11 +2537,10 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) pci_dev_put(pvt->F0); pvt->F0 = NULL; - amd64_err("error F6 device not found: vendor %x device 0x%x (broken BIOS?)\n", - PCI_VENDOR_ID_AMD, pci_id2); - + amd64_err("F6 not found: device 0x%x (broken BIOS?)\n", pci_id2); return -ENODEV; } + edac_dbg(1, "F0: %s\n", pci_name(pvt->F0)); edac_dbg(1, "F3: %s\n", pci_name(pvt->F3)); edac_dbg(1, "F6: %s\n", pci_name(pvt->F6)); @@ -2553,8 +2551,7 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) /* Reserve the ADDRESS MAP Device */ pvt->F1 = pci_get_related_function(pvt->F3->vendor, pci_id1, pvt->F3); if (!pvt->F1) { - amd64_err("error address map device not found: vendor %x device 0x%x (broken BIOS?)\n", - PCI_VENDOR_ID_AMD, pci_id1); + amd64_err("F1 not found: device 0x%x (broken BIOS?)\n", pci_id1); return -ENODEV; } @@ -2564,9 +2561,8 @@ reserve_mc_sibling_devs(struct amd64_pvt *pvt, u16 pci_id1, u16 pci_id2) pci_dev_put(pvt->F1); pvt->F1 = NULL; - amd64_err("error F2 device not found: vendor %x device 0x%x (broken BIOS?)\n", - PCI_VENDOR_ID_AMD, pci_id2); - return -ENODEV; + amd64_err("F2 not found: device 0x%x (broken BIOS?)\n", pci_id2); + 
return -ENODEV; } edac_dbg(1, "F1: %s\n", pci_name(pvt->F1)); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index c3b004a53eea..f14c24d5b140 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -30,10 +30,10 @@ edac_printk(KERN_NOTICE, "amd64", fmt, ##arg) #define amd64_warn(fmt, arg...) \ - edac_printk(KERN_WARNING, "amd64", fmt, ##arg) + edac_printk(KERN_WARNING, "amd64", "Warning: " fmt, ##arg) #define amd64_err(fmt, arg...) \ - edac_printk(KERN_ERR, "amd64", fmt, ##arg) + edac_printk(KERN_ERR, "amd64", "Error: " fmt, ##arg) #define amd64_mc_warn(mci, fmt, arg...) \ edac_mc_chipset_printk(mci, KERN_WARNING, "amd64", fmt, ##arg) From 0de2788447b67891a31a156c0206fd159e4a8981 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sun, 4 Dec 2016 14:07:18 +0800 Subject: [PATCH 26/26] EDAC, amd64: Fix improper return value When the call to zalloc_cpumask_var() fails, returning "false" seems improper. The real value of macro "false" is 0, and 0 means no error. Return -ENOMEM instead. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=189071 Signed-off-by: Pan Bian Cc: linux-edac Link: http://lkml.kernel.org/r/1480831638-5361-1-git-send-email-bianpan201604@163.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9a7cf3c8df06..260251177830 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2913,7 +2913,7 @@ static int toggle_ecc_err_reporting(struct ecc_settings *s, u16 nid, bool on) if (!zalloc_cpumask_var(&cmask, GFP_KERNEL)) { amd64_warn("%s: error allocating mask\n", __func__); - return false; + return -ENOMEM; } get_cpus_on_this_dct_cpumask(cmask, nid);