* Add KNM support to sb_edac (Piotr Luc)

* Add AMD Zen support to amd64_edac (Yazen Ghannam)
 
 * misc small cleanups, improvements, fixes by:
  - Colin Ian King
  - Dave Hansen
  - Pan Bian
  - Thor Thayer
  - Wei Yongjun
  - Yanjiang Jin
  - yours truly
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABCAAGBQJYTo5dAAoJEBLB8Bhh3lVKYD4QAKWu8XM3aOc41nTAYBLq0pqr
 MSXx0evsShlnKFvPj3173Cm2cSwNMi+Jebq/FeX6oX58mqgkL0zjXAuuRSFk/lF2
 GyclqGQfaxpWiE3GjyFR0Q3olB4g52RnoqgV+V8D8XXQkqrZKXcgtmXjjeuNgNHS
 fouh2dm93U8xuNjrouAU9f4qO1pu9qjUFO8Jl82Ev4I1t9F2ZoTYOvbKUQ9vpPmT
 o9/PIL4ywjMmZdsznzmcL3mteX8SK6is4OR6TOSc80F30KEsmFZ/zc1eJ3qo1cH2
 8MLgtm+blQJp2jFnifGYZL+RcAL7KRSsKdIEj/E5dDrlVIHue/wpyr7Om2IE8WC+
 Ns7NWDjKSJa3ljMdO1QD0YmpRgHRCVMl6mjRcWPmub0bv/QOfzeVR0fXu2M2PQNm
 +bRrug+994pCbeJ+j2zx2YcZJbXY/4/KeYsBIszscm1OIUB+MTh+LRaAYpuIlbgr
 /LgoJd9YSLytIQgtTLZUUQ+F1YMjYj3UvMS/agoCnKGNLNjreCVJtEO4yVz7Ry4p
 R2sF50QsKD7HX4wFzTgLFscwZ/I/poSkRM2ZFtnLzbNtlbOdC4470euzGAAWIZdO
 r7rpQ2yTfgTq+YnhnfZ55Mdlem5cEeeaLYCfS6zQLFf2y9rS6xfO2ps4HaH/oqxo
 3SPzXq56ew+RUTefTQCR
 =pZ7o
 -----END PGP SIGNATURE-----

Merge tag 'edac_for_4.10' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp

Pull EDAC updates from Borislav Petkov:

 - add KNM support to sb_edac (Piotr Luc)

 - add AMD Zen support to amd64_edac (Yazen Ghannam)

 - misc small cleanups, improvements and fixes (Colin Ian King, Dave
   Hansen, Pan Bian, Thor Thayer, Wei Yongjun, Yanjiang Jin, yours
   truly)

* tag 'edac_for_4.10' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: (26 commits)
  EDAC, amd64: Fix improper return value
  EDAC, amd64: Improve amd64-specific printing macros
  EDAC, amd64: Autoload amd64_edac_mod on Fam17h systems
  EDAC, amd64: Define and register UMC error decode function
  EDAC, amd64: Determine EDAC capabilities on Fam17h systems
  EDAC, amd64: Determine EDAC MC capabilities on Fam17h
  EDAC, amd64: Add Fam17h debug output
  EDAC, amd64: Add Fam17h scrubber support
  EDAC, mce_amd: Don't report poison bit on Fam15h, bank 4
  EDAC, amd64: Read MC registers on AMD Fam17h
  EDAC, amd64: Reserve correct PCI devices on AMD Fam17h
  EDAC, amd64: Add AMD Fam17h family type and ops
  EDAC, amd64: Extend ecc_enabled() to Fam17h
  EDAC, amd64: Don't force-enable ECC checking on newer systems
  EDAC, amd64: Add Deferred Error type
  EDAC, amd64: Rename __log_bus_error() to be more specific
  EDAC, amd64: Change target of pci_name from F2 to F3
  EDAC, mce_amd: Rename nb_bus_decoder to dram_ecc_decoder
  EDAC: Add LRDDR4 DRAM type
  EDAC, mpc85xx: Implement remove method for the platform driver
  ...
This commit is contained in:
Linus Torvalds 2016-12-13 09:03:52 -08:00
commit daf34710a9
10 changed files with 726 additions and 157 deletions

View File

@ -153,13 +153,17 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file,
if (count == 3) {
edac_printk(KERN_ALERT, EDAC_MC,
"Inject Double bit error\n");
local_irq_disable();
regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset,
(read_reg | priv->ue_set_mask));
local_irq_enable();
} else {
edac_printk(KERN_ALERT, EDAC_MC,
"Inject Single bit error\n");
local_irq_disable();
regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset,
(read_reg | priv->ce_set_mask));
local_irq_enable();
}
ptemp[0] = 0x5A5A5A5A;

File diff suppressed because it is too large Load Diff

View File

@ -30,10 +30,10 @@
edac_printk(KERN_NOTICE, "amd64", fmt, ##arg)
#define amd64_warn(fmt, arg...) \
edac_printk(KERN_WARNING, "amd64", fmt, ##arg)
edac_printk(KERN_WARNING, "amd64", "Warning: " fmt, ##arg)
#define amd64_err(fmt, arg...) \
edac_printk(KERN_ERR, "amd64", fmt, ##arg)
edac_printk(KERN_ERR, "amd64", "Error: " fmt, ##arg)
#define amd64_mc_warn(mci, fmt, arg...) \
edac_mc_chipset_printk(mci, KERN_WARNING, "amd64", fmt, ##arg)
@ -118,6 +118,8 @@
#define PCI_DEVICE_ID_AMD_16H_NB_F2 0x1532
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F1 0x1581
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582
#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460
#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466
/*
* Function 1 - Address Map
@ -202,6 +204,8 @@
#define DCT_SEL_HI 0x114
#define F15H_M60H_SCRCTRL 0x1C8
#define F17H_SCR_BASE_ADDR 0x48
#define F17H_SCR_LIMIT_ADDR 0x4C
/*
* Function 3 - Misc Control
@ -248,6 +252,31 @@
/* MSRs */
#define MSR_MCGCTL_NBE BIT(4)
/* F17h */
/* F0: */
#define DF_DHAR 0x104
/* UMC CH register offsets */
#define UMCCH_BASE_ADDR 0x0
#define UMCCH_ADDR_MASK 0x20
#define UMCCH_ADDR_CFG 0x30
#define UMCCH_DIMM_CFG 0x80
#define UMCCH_UMC_CFG 0x100
#define UMCCH_SDP_CTRL 0x104
#define UMCCH_ECC_CTRL 0x14C
#define UMCCH_ECC_BAD_SYMBOL 0xD90
#define UMCCH_UMC_CAP 0xDF0
#define UMCCH_UMC_CAP_HI 0xDF4
/* UMC CH bitfields */
#define UMC_ECC_CHIPKILL_CAP BIT(31)
#define UMC_ECC_ENABLED BIT(30)
#define UMC_SDP_INIT BIT(31)
#define NUM_UMCS 2
enum amd_families {
K8_CPUS = 0,
F10_CPUS,
@ -256,6 +285,7 @@ enum amd_families {
F15_M60H_CPUS,
F16_CPUS,
F16_M30H_CPUS,
F17_CPUS,
NUM_FAMILIES,
};
@ -288,11 +318,19 @@ struct chip_select {
u8 m_cnt;
};
struct amd64_umc {
u32 dimm_cfg; /* DIMM Configuration reg */
u32 umc_cfg; /* Configuration reg */
u32 sdp_ctrl; /* SDP Control reg */
u32 ecc_ctrl; /* DRAM ECC Control reg */
u32 umc_cap_hi; /* Capabilities High reg */
};
struct amd64_pvt {
struct low_ops *ops;
/* pci_device handles which we utilize */
struct pci_dev *F1, *F2, *F3;
struct pci_dev *F0, *F1, *F2, *F3, *F6;
u16 mc_node_id; /* MC index of this MC node */
u8 fam; /* CPU family */
@ -335,6 +373,8 @@ struct amd64_pvt {
/* cache the dram_type */
enum mem_type dram_type;
struct amd64_umc *umc; /* UMC registers */
};
enum err_codes {
@ -342,6 +382,8 @@ enum err_codes {
ERR_NODE = -1,
ERR_CSROW = -2,
ERR_CHANNEL = -3,
ERR_SYND = -4,
ERR_NORM_ADDR = -5,
};
struct err_info {
@ -354,6 +396,12 @@ struct err_info {
u32 offset;
};
static inline u32 get_umc_base(u8 channel)
{
/* ch0: 0x50000, ch1: 0x150000 */
return 0x50000 + (!!channel << 20);
}
static inline u64 get_dram_base(struct amd64_pvt *pvt, u8 i)
{
u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8;
@ -422,7 +470,7 @@ struct low_ops {
struct amd64_family_type {
const char *ctl_name;
u16 f1_id, f2_id;
u16 f0_id, f1_id, f2_id, f6_id;
struct low_ops ops;
};

View File

@ -482,15 +482,8 @@ void edac_mc_free(struct mem_ctl_info *mci)
}
EXPORT_SYMBOL_GPL(edac_mc_free);
/**
* find_mci_by_dev
*
* scan list of controllers looking for the one that manages
* the 'dev' device
* @dev: pointer to a struct device related with the MCI
*/
struct mem_ctl_info *find_mci_by_dev(struct device *dev)
/* Caller must hold mem_ctls_mutex */
static struct mem_ctl_info *__find_mci_by_dev(struct device *dev)
{
struct mem_ctl_info *mci;
struct list_head *item;
@ -506,6 +499,24 @@ struct mem_ctl_info *find_mci_by_dev(struct device *dev)
return NULL;
}
/**
* find_mci_by_dev
*
* scan list of controllers looking for the one that manages
* the 'dev' device
* @dev: pointer to a struct device related with the MCI
*/
struct mem_ctl_info *find_mci_by_dev(struct device *dev)
{
struct mem_ctl_info *ret;
mutex_lock(&mem_ctls_mutex);
ret = __find_mci_by_dev(dev);
mutex_unlock(&mem_ctls_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(find_mci_by_dev);
/*
@ -588,7 +599,7 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
insert_before = &mc_devices;
p = find_mci_by_dev(mci->pdev);
p = __find_mci_by_dev(mci->pdev);
if (unlikely(p != NULL))
goto fail0;
@ -640,26 +651,28 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci)
*
* If found, return a pointer to the structure.
* Else return NULL.
*
* Caller must hold mem_ctls_mutex.
*/
struct mem_ctl_info *edac_mc_find(int idx)
{
struct mem_ctl_info *mci = NULL;
struct list_head *item;
struct mem_ctl_info *mci;
mutex_lock(&mem_ctls_mutex);
list_for_each(item, &mc_devices) {
mci = list_entry(item, struct mem_ctl_info, link);
if (mci->mc_idx >= idx) {
if (mci->mc_idx == idx)
return mci;
if (mci->mc_idx == idx) {
goto unlock;
}
break;
}
}
return NULL;
unlock:
mutex_unlock(&mem_ctls_mutex);
return mci;
}
EXPORT_SYMBOL(edac_mc_find);
@ -779,7 +792,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
mutex_lock(&mem_ctls_mutex);
/* find the requested mci struct in the global list */
mci = find_mci_by_dev(dev);
mci = __find_mci_by_dev(dev);
if (mci == NULL) {
mutex_unlock(&mem_ctls_mutex);
return NULL;

View File

@ -8,7 +8,7 @@ static struct amd_decoder_ops *fam_ops;
static u8 xec_mask = 0xf;
static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct mce *m);
static void (*decode_dram_ecc)(int node_id, struct mce *m);
void amd_report_gart_errors(bool v)
{
@ -18,16 +18,16 @@ EXPORT_SYMBOL_GPL(amd_report_gart_errors);
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
nb_bus_decoder = f;
decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
if (nb_bus_decoder) {
WARN_ON(nb_bus_decoder != f);
if (decode_dram_ecc) {
WARN_ON(decode_dram_ecc != f);
nb_bus_decoder = NULL;
decode_dram_ecc = NULL;
}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
@ -763,8 +763,8 @@ static void decode_mc4_mce(struct mce *m)
pr_cont("%s.\n", mc4_mce_desc[xec]);
if (nb_bus_decoder)
nb_bus_decoder(node_id, m);
if (decode_dram_ecc)
decode_dram_ecc(node_id, m);
return;
}
break;
@ -877,6 +877,13 @@ static void decode_smca_errors(struct mce *m)
pr_emerg(HW_ERR "%s Error: ", ip_name);
pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
}
/*
* amd_get_nb_id() returns the last level cache id.
* The last level cache on Fam17h is 1 level below the node.
*/
if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
}
static inline void amd_decode_err_code(u16 ec)
@ -957,10 +964,13 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
if (c->x86 >= 0x15)
pr_cont("|%s|%s",
((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
if (c->x86 >= 0x15) {
pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
/* F15h, bank4, bit 43 is part of McaStatSubCache. */
if (c->x86 != 0x15 || m->bank != 4)
pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
}
if (boot_cpu_has(X86_FEATURE_SMCA)) {
u32 low, high;

View File

@ -300,6 +300,22 @@ err:
return res;
}
static int mpc85xx_pci_err_remove(struct platform_device *op)
{
struct edac_pci_ctl_info *pci = dev_get_drvdata(&op->dev);
struct mpc85xx_pci_pdata *pdata = pci->pvt_info;
edac_dbg(0, "\n");
out_be32(pdata->pci_vbase + MPC85XX_PCI_ERR_ADDR, orig_pci_err_cap_dr);
out_be32(pdata->pci_vbase + MPC85XX_PCI_ERR_EN, orig_pci_err_en);
edac_pci_del_device(&op->dev);
edac_pci_free_ctl_info(pci);
return 0;
}
static const struct platform_device_id mpc85xx_pci_err_match[] = {
{
.name = "mpc85xx-pci-edac"
@ -309,6 +325,7 @@ static const struct platform_device_id mpc85xx_pci_err_match[] = {
static struct platform_driver mpc85xx_pci_err_driver = {
.probe = mpc85xx_pci_err_probe,
.remove = mpc85xx_pci_err_remove,
.id_table = mpc85xx_pci_err_match,
.driver = {
.name = "mpc85xx_pci_err",

View File

@ -23,6 +23,7 @@
#include <linux/math64.h>
#include <linux/mod_devicetable.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/mce.h>
@ -3365,12 +3366,13 @@ fail0:
{ X86_VENDOR_INTEL, 6, model, 0, (unsigned long)&table }
static const struct x86_cpu_id sbridge_cpuids[] = {
ICPU(0x2d, pci_dev_descr_sbridge_table), /* SANDY_BRIDGE */
ICPU(0x3e, pci_dev_descr_ibridge_table), /* IVY_BRIDGE */
ICPU(0x3f, pci_dev_descr_haswell_table), /* HASWELL */
ICPU(0x4f, pci_dev_descr_broadwell_table), /* BROADWELL */
ICPU(0x56, pci_dev_descr_broadwell_table), /* BROADWELL-DE */
ICPU(0x57, pci_dev_descr_knl_table), /* KNIGHTS_LANDING */
ICPU(INTEL_FAM6_SANDYBRIDGE_X, pci_dev_descr_sbridge_table),
ICPU(INTEL_FAM6_IVYBRIDGE_X, pci_dev_descr_ibridge_table),
ICPU(INTEL_FAM6_HASWELL_X, pci_dev_descr_haswell_table),
ICPU(INTEL_FAM6_BROADWELL_X, pci_dev_descr_broadwell_table),
ICPU(INTEL_FAM6_BROADWELL_XEON_D, pci_dev_descr_broadwell_table),
ICPU(INTEL_FAM6_XEON_PHI_KNL, pci_dev_descr_knl_table),
ICPU(INTEL_FAM6_XEON_PHI_KNM, pci_dev_descr_knl_table),
{ }
};
MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids);

View File

@ -25,6 +25,7 @@
#include <linux/math64.h>
#include <linux/mod_devicetable.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/mce.h>
@ -262,8 +263,8 @@ fail:
return -ENODEV;
}
const struct x86_cpu_id skx_cpuids[] = {
{ X86_VENDOR_INTEL, 6, 0x55, 0, 0 }, /* Skylake */
static const struct x86_cpu_id skx_cpuids[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X, 0, 0 },
{ }
};
MODULE_DEVICE_TABLE(x86cpu, skx_cpuids);
@ -1036,7 +1037,7 @@ static void skx_remove(void)
* search for all the devices we need
* check which DIMMs are present.
*/
int __init skx_init(void)
static int __init skx_init(void)
{
const struct x86_cpu_id *id;
const struct munit *m;

View File

@ -1602,16 +1602,16 @@ static void xgene_edac_pa_report(struct edac_device_ctl_info *edac_dev)
dev_err(edac_dev->dev, "IOB PA read data RAM error\n");
if (reg & IOBPA_M_RDATA_CORRUPT_MASK)
dev_err(edac_dev->dev,
"Mutilple IOB PA read data RAM error\n");
"Multiple IOB PA read data RAM error\n");
if (reg & IOBPA_WDATA_CORRUPT_MASK)
dev_err(edac_dev->dev, "IOB PA write data RAM error\n");
if (reg & IOBPA_M_WDATA_CORRUPT_MASK)
dev_err(edac_dev->dev,
"Mutilple IOB PA write data RAM error\n");
"Multiple IOB PA write data RAM error\n");
if (reg & IOBPA_TRANS_CORRUPT_MASK)
dev_err(edac_dev->dev, "IOB PA transaction error\n");
if (reg & IOBPA_M_TRANS_CORRUPT_MASK)
dev_err(edac_dev->dev, "Mutilple IOB PA transaction error\n");
dev_err(edac_dev->dev, "Multiple IOB PA transaction error\n");
if (reg & IOBPA_REQIDRAM_CORRUPT_MASK)
dev_err(edac_dev->dev, "IOB PA transaction ID RAM error\n");
if (reg & IOBPA_M_REQIDRAM_CORRUPT_MASK)

View File

@ -134,6 +134,7 @@ enum dev_type {
enum hw_event_mc_err_type {
HW_EVENT_ERR_CORRECTED,
HW_EVENT_ERR_UNCORRECTED,
HW_EVENT_ERR_DEFERRED,
HW_EVENT_ERR_FATAL,
HW_EVENT_ERR_INFO,
};
@ -145,6 +146,8 @@ static inline char *mc_event_error_type(const unsigned int err_type)
return "Corrected";
case HW_EVENT_ERR_UNCORRECTED:
return "Uncorrected";
case HW_EVENT_ERR_DEFERRED:
return "Deferred";
case HW_EVENT_ERR_FATAL:
return "Fatal";
default:
@ -192,10 +195,11 @@ static inline char *mc_event_error_type(const unsigned int err_type)
* @MEM_DDR3: DDR3 RAM
* @MEM_RDDR3: Registered DDR3 RAM
* This is a variant of the DDR3 memories.
* @MEM_LRDDR3 Load-Reduced DDR3 memory.
* @MEM_LRDDR3: Load-Reduced DDR3 memory.
* @MEM_DDR4: Unbuffered DDR4 RAM
* @MEM_RDDR4: Registered DDR4 RAM
* This is a variant of the DDR4 memories.
* @MEM_LRDDR4: Load-Reduced DDR4 memory.
*/
enum mem_type {
MEM_EMPTY = 0,
@ -218,6 +222,7 @@ enum mem_type {
MEM_LRDDR3,
MEM_DDR4,
MEM_RDDR4,
MEM_LRDDR4,
};
#define MEM_FLAG_EMPTY BIT(MEM_EMPTY)
@ -239,6 +244,7 @@ enum mem_type {
#define MEM_FLAG_RDDR3 BIT(MEM_RDDR3)
#define MEM_FLAG_DDR4 BIT(MEM_DDR4)
#define MEM_FLAG_RDDR4 BIT(MEM_RDDR4)
#define MEM_FLAG_LRDDR4 BIT(MEM_LRDDR4)
/**
* enum edac-type - Error Detection and Correction capabilities and mode