From 1e4511604dfaf6d1642603ff89effb9e93682716 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Jul 2018 16:16:55 -0500 Subject: [PATCH 01/16] PCI/AER: Expose internal API for obtaining AER information Export some common AER functions and structures for other PCI core drivers to use. Since this is making the function externally visible inside the PCI core, prepend "aer_" to the function name. Signed-off-by: Keith Busch [bhelgaas: move AER declarations from linux/aer.h to drivers/pci/pci.h] Signed-off-by: Bjorn Helgaas Reviewed-by: Sinan Kaya Reviewed-by: Oza Pawandeep --- drivers/pci/pci.h | 28 ++++++++++++++++++++++++++++ drivers/pci/pcie/aer.c | 30 +++++------------------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index c358e7a07f3f..4f723442f602 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -300,6 +300,34 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags); } +#ifdef CONFIG_PCIEAER +#include <linux/aer.h> + +#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ + +struct aer_err_info { + struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; + int error_dev_num; + + unsigned int id:16; + + unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ + unsigned int __pad1:5; + unsigned int multi_error_valid:1; + + unsigned int first_error:5; + unsigned int __pad2:2; + unsigned int tlp_header_valid:1; + + unsigned int status; /* COR/UNCOR Error Status */ + unsigned int mask; /* COR/UNCOR Error Mask */ + struct aer_header_log_regs tlp; /* TLP Header */ +}; + +int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); +#endif /* CONFIG_PCIEAER */ + #ifdef CONFIG_PCI_ATS void pci_restore_ats_state(struct pci_dev *dev); #else diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index a2e88386af28..0a60275f0582 100644 ---
a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -31,26 +31,6 @@ #include "portdrv.h" #define AER_ERROR_SOURCES_MAX 100 -#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ - -struct aer_err_info { - struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; - int error_dev_num; - - unsigned int id:16; - - unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ - unsigned int __pad1:5; - unsigned int multi_error_valid:1; - - unsigned int first_error:5; - unsigned int __pad2:2; - unsigned int tlp_header_valid:1; - - unsigned int status; /* COR/UNCOR Error Status */ - unsigned int mask; /* COR/UNCOR Error Mask */ - struct aer_header_log_regs tlp; /* TLP Header */ -}; struct aer_err_source { unsigned int status; @@ -547,7 +527,7 @@ static void __aer_print_error(struct pci_dev *dev, } } -static void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { int layer, agent; int id = ((dev->bus->number << 8) | dev->devfn); @@ -876,7 +856,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); #endif /** - * get_device_error_info - read error status from dev and store it to info + * aer_get_device_error_info - read error status from dev and store it to info * @dev: pointer to the device expected to have a error record * @info: pointer to structure to store the error record * @@ -884,7 +864,7 @@ EXPORT_SYMBOL_GPL(aer_recover_queue); * * Note that @info is reused among all error devices. Clear fields properly. */ -static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) +int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info) { int pos, temp; @@ -942,11 +922,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info) /* Report all before handle them, not to lost records by reset etc. 
*/ for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { - if (get_device_error_info(e_info->dev[i], e_info)) + if (aer_get_device_error_info(e_info->dev[i], e_info)) aer_print_error(e_info->dev[i], e_info); } for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) { - if (get_device_error_info(e_info->dev[i], e_info)) + if (aer_get_device_error_info(e_info->dev[i], e_info)) handle_error_source(e_info->dev[i], e_info); } } From bd237801fef230cea8f2a5ab550d500f19f856d8 Mon Sep 17 00:00:00 2001 From: Tyler Baicar Date: Tue, 26 Jun 2018 11:44:15 -0400 Subject: [PATCH 02/16] PCI/AER: Adopt lspci names for AER error decoding lspci uses abbreviated naming for AER error strings. Adopt the same naming convention for the AER printing so they match. Signed-off-by: Tyler Baicar Signed-off-by: Bjorn Helgaas Reviewed-by: Oza Pawandeep --- drivers/pci/pcie/aer.c | 46 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 0a60275f0582..e6d5255d718c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -439,22 +439,22 @@ static const char *aer_error_layer[] = { }; static const char *aer_correctable_error_string[] = { - "Receiver Error", /* Bit Position 0 */ + "RxErr", /* Bit Position 0 */ NULL, NULL, NULL, NULL, NULL, - "Bad TLP", /* Bit Position 6 */ - "Bad DLLP", /* Bit Position 7 */ - "RELAY_NUM Rollover", /* Bit Position 8 */ + "BadTLP", /* Bit Position 6 */ + "BadDLLP", /* Bit Position 7 */ + "Rollover", /* Bit Position 8 */ NULL, NULL, NULL, - "Replay Timer Timeout", /* Bit Position 12 */ - "Advisory Non-Fatal", /* Bit Position 13 */ - "Corrected Internal Error", /* Bit Position 14 */ - "Header Log Overflow", /* Bit Position 15 */ + "Timeout", /* Bit Position 12 */ + "NonFatalErr", /* Bit Position 13 */ + "CorrIntErr", /* Bit Position 14 */ + "HeaderOF", /* Bit Position 15 */ }; static const char *aer_uncorrectable_error_string[] = { @@ 
-462,28 +462,28 @@ static const char *aer_uncorrectable_error_string[] = { NULL, NULL, NULL, - "Data Link Protocol", /* Bit Position 4 */ - "Surprise Down Error", /* Bit Position 5 */ + "DLP", /* Bit Position 4 */ + "SDES", /* Bit Position 5 */ NULL, NULL, NULL, NULL, NULL, NULL, - "Poisoned TLP", /* Bit Position 12 */ - "Flow Control Protocol", /* Bit Position 13 */ - "Completion Timeout", /* Bit Position 14 */ - "Completer Abort", /* Bit Position 15 */ - "Unexpected Completion", /* Bit Position 16 */ - "Receiver Overflow", /* Bit Position 17 */ - "Malformed TLP", /* Bit Position 18 */ + "TLP", /* Bit Position 12 */ + "FCP", /* Bit Position 13 */ + "CmpltTO", /* Bit Position 14 */ + "CmpltAbrt", /* Bit Position 15 */ + "UnxCmplt", /* Bit Position 16 */ + "RxOF", /* Bit Position 17 */ + "MalfTLP", /* Bit Position 18 */ "ECRC", /* Bit Position 19 */ - "Unsupported Request", /* Bit Position 20 */ - "ACS Violation", /* Bit Position 21 */ - "Uncorrectable Internal Error", /* Bit Position 22 */ - "MC Blocked TLP", /* Bit Position 23 */ - "AtomicOp Egress Blocked", /* Bit Position 24 */ - "TLP Prefix Blocked Error", /* Bit Position 25 */ + "UnsupReq", /* Bit Position 20 */ + "ACSViol", /* Bit Position 21 */ + "UncorrIntErr", /* Bit Position 22 */ + "BlockedTLP", /* Bit Position 23 */ + "AtomicOpBlocked", /* Bit Position 24 */ + "TLPBlockedErr", /* Bit Position 25 */ }; static const char *aer_agent_string[] = { From 60ed982a4e78ff938824a750dbac8a10e5b472ef Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Thu, 21 Jun 2018 16:48:26 -0700 Subject: [PATCH 03/16] PCI/AER: Move internal declarations to drivers/pci/pci.h Since pci_aer_init() and pci_no_aer() are used only internally, move their declarations to the PCI internal header file. Also, no one cares about return value of pci_aer_init(), so make it void. 
Signed-off-by: Rajat Jain Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 8 ++++++++ drivers/pci/pcie/aer.c | 4 ++-- include/linux/pci.h | 4 ---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4f723442f602..52bc5b350dfb 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -480,4 +480,12 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev, } #endif +#ifdef CONFIG_PCIEAER +void pci_no_aer(void); +void pci_aer_init(struct pci_dev *dev); +#else +static inline void pci_no_aer(void) { } +static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index e6d5255d718c..0c6fe22eaf75 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -382,10 +382,10 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) return 0; } -int pci_aer_init(struct pci_dev *dev) +void pci_aer_init(struct pci_dev *dev) { dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); - return pci_cleanup_aer_error_status_regs(dev); + pci_cleanup_aer_error_status_regs(dev); } #define AER_AGENT_RECEIVER 0 diff --git a/include/linux/pci.h b/include/linux/pci.h index 340029b2fb38..b4ffea05c999 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1468,13 +1468,9 @@ static inline bool pcie_aspm_support_enabled(void) { return false; } #endif #ifdef CONFIG_PCIEAER -void pci_no_aer(void); bool pci_aer_available(void); -int pci_aer_init(struct pci_dev *dev); #else -static inline void pci_no_aer(void) { } static inline bool pci_aer_available(void) { return false; } -static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } #endif #ifdef CONFIG_PCIE_ECRC From db89ccbe52c7885644ba578c7771e57620f879b1 Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Sat, 30 Jun 2018 15:07:17 -0500 Subject: [PATCH 04/16] PCI/AER: Define aer_stats structure for AER capable devices Define a 
structure to hold the AER statistics. There are 2 groups of statistics: dev_* counters that are to be collected for all AER capable devices and rootport_* counters that are collected for all (AER capable) rootports only. Allocate and free this structure when device is added or released (thus counters survive the lifetime of the device). Signed-off-by: Rajat Jain Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/aer.c | 53 ++++++++++++++++++++++++++++++++++++++++-- drivers/pci/probe.c | 1 + include/linux/pci.h | 1 + 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 52bc5b350dfb..1877a14e06a9 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -483,9 +483,11 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev, #ifdef CONFIG_PCIEAER void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); +void pci_aer_exit(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } +static inline void pci_aer_exit(struct pci_dev *d) { } #endif #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 0c6fe22eaf75..fe1b9d22a331 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -32,6 +32,9 @@ #define AER_ERROR_SOURCES_MAX 100 +#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */ +#define AER_MAX_TYPEOF_UNCOR_ERRS 26 /* as per PCI_ERR_UNCOR_STATUS*/ + struct aer_err_source { unsigned int status; unsigned int id; @@ -56,6 +59,42 @@ struct aer_rpc { */ }; +/* AER stats for the device */ +struct aer_stats { + + /* + * Fields for all AER capable devices. They indicate the errors + * "as seen by this device". Note that this may mean that if an + * end point is causing problems, the AER counters may increment + * at its link partner (e.g. 
root port) because the errors will be + * "seen" by the link partner and not the problematic end point + * itself (which may report all counters as 0 as it never saw any + * problems). + */ + /* Counters for different type of correctable errors */ + u64 dev_cor_errs[AER_MAX_TYPEOF_COR_ERRS]; + /* Counters for different type of fatal uncorrectable errors */ + u64 dev_fatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS]; + /* Counters for different type of nonfatal uncorrectable errors */ + u64 dev_nonfatal_errs[AER_MAX_TYPEOF_UNCOR_ERRS]; + /* Total number of ERR_COR sent by this device */ + u64 dev_total_cor_errs; + /* Total number of ERR_FATAL sent by this device */ + u64 dev_total_fatal_errs; + /* Total number of ERR_NONFATAL sent by this device */ + u64 dev_total_nonfatal_errs; + + /* + * Fields for Root ports & root complex event collectors only, these + * indicate the total number of ERR_COR, ERR_FATAL, and ERR_NONFATAL + * messages received by the root port / event collector, INCLUDING the + * ones that are generated internally (by the rootport itself) + */ + u64 rootport_total_cor_errs; + u64 rootport_total_fatal_errs; + u64 rootport_total_nonfatal_errs; +}; + #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ PCI_ERR_UNC_ECRC| \ PCI_ERR_UNC_UNSUP| \ @@ -385,9 +424,19 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) void pci_aer_init(struct pci_dev *dev) { dev->aer_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + + if (dev->aer_cap) + dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL); + pci_cleanup_aer_error_status_regs(dev); } +void pci_aer_exit(struct pci_dev *dev) +{ + kfree(dev->aer_stats); + dev->aer_stats = NULL; +} + #define AER_AGENT_RECEIVER 0 #define AER_AGENT_REQUESTER 1 #define AER_AGENT_COMPLETER 2 @@ -438,7 +487,7 @@ static const char *aer_error_layer[] = { "Transaction Layer" }; -static const char *aer_correctable_error_string[] = { +static const char *aer_correctable_error_string[AER_MAX_TYPEOF_COR_ERRS] = { "RxErr",
/* Bit Position 0 */ NULL, NULL, @@ -457,7 +506,7 @@ static const char *aer_correctable_error_string[] = { "HeaderOF", /* Bit Position 15 */ }; -static const char *aer_uncorrectable_error_string[] = { +static const char *aer_uncorrectable_error_string[AER_MAX_TYPEOF_UNCOR_ERRS] = { "Undefined", /* Bit Position 0 */ NULL, NULL, diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ac876e32de4b..48edd0c9e4bc 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2064,6 +2064,7 @@ static void pci_configure_device(struct pci_dev *dev) static void pci_release_capabilities(struct pci_dev *dev) { + pci_aer_exit(dev); pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index b4ffea05c999..6bc0aa0fc33f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -299,6 +299,7 @@ struct pci_dev { u8 hdr_type; /* PCI header type (`multi' flag masked out) */ #ifdef CONFIG_PCIEAER u16 aer_cap; /* AER capability offset */ + struct aer_stats *aer_stats; /* AER stats for this device */ #endif u8 pcie_cap; /* PCIe capability offset */ u8 msi_cap; /* MSI capability offset */ From 81aa5206f9a7c9793e2f7971400351664e40b04f Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Thu, 21 Jun 2018 16:48:28 -0700 Subject: [PATCH 05/16] PCI/AER: Add sysfs attributes to provide AER stats and breakdown Add sysfs attributes to provide total and breakdown of the AERs seen, into different type of correctable, fatal and nonfatal errors: /sys/bus/pci/devices/<dev>/aer_dev_correctable /sys/bus/pci/devices/<dev>/aer_dev_fatal /sys/bus/pci/devices/<dev>/aer_dev_nonfatal Signed-off-by: Rajat Jain Signed-off-by: Bjorn Helgaas --- .../testing/sysfs-bus-pci-devices-aer_stats | 94 +++++++++++++++++++ Documentation/PCI/pcieaer-howto.txt | 5 + drivers/pci/pci-sysfs.c | 3 + drivers/pci/pci.h | 1 + drivers/pci/pcie/aer.c | 94 +++++++++++++++++++ 5 files changed, 197 insertions(+) create mode 100644
Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats new file mode 100644 index 000000000000..3a784297cfed --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats @@ -0,0 +1,94 @@ +========================== +PCIe Device AER statistics +========================== +These attributes show up under all the devices that are AER capable. These +statistical counters indicate the errors "as seen/reported by the device". +Note that this may mean that if an endpoint is causing problems, the AER +counters may increment at its link partner (e.g. root port) because the +errors may be "seen" / reported by the link partner and not the +problematic endpoint itself (which may report all counters as 0 as it never +saw any problems). + +Where: /sys/bus/pci/devices/<dev>/aer_dev_correctable +Date: July 2018 +Kernel Version: 4.19.0 +Contact: linux-pci@vger.kernel.org, rajatja@google.com +Description: List of correctable errors seen and reported by this + PCI device using ERR_COR. Note that since multiple errors may + be reported using a single ERR_COR message, thus + TOTAL_ERR_COR at the end of the file may not match the actual + total of all the errors in the file. Sample output: +------------------------------------------------------------------------- +localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_correctable +Receiver Error 2 +Bad TLP 0 +Bad DLLP 0 +RELAY_NUM Rollover 0 +Replay Timer Timeout 0 +Advisory Non-Fatal 0 +Corrected Internal Error 0 +Header Log Overflow 0 +TOTAL_ERR_COR 2 +------------------------------------------------------------------------- + +Where: /sys/bus/pci/devices/<dev>/aer_dev_fatal +Date: July 2018 +Kernel Version: 4.19.0 +Contact: linux-pci@vger.kernel.org, rajatja@google.com +Description: List of uncorrectable fatal errors seen and reported by this + PCI device using ERR_FATAL.
Note that since multiple errors may + be reported using a single ERR_FATAL message, thus + TOTAL_ERR_FATAL at the end of the file may not match the actual + total of all the errors in the file. Sample output: +------------------------------------------------------------------------- +localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_fatal +Undefined 0 +Data Link Protocol 0 +Surprise Down Error 0 +Poisoned TLP 0 +Flow Control Protocol 0 +Completion Timeout 0 +Completer Abort 0 +Unexpected Completion 0 +Receiver Overflow 0 +Malformed TLP 0 +ECRC 0 +Unsupported Request 0 +ACS Violation 0 +Uncorrectable Internal Error 0 +MC Blocked TLP 0 +AtomicOp Egress Blocked 0 +TLP Prefix Blocked Error 0 +TOTAL_ERR_FATAL 0 +------------------------------------------------------------------------- + +Where: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal +Date: July 2018 +Kernel Version: 4.19.0 +Contact: linux-pci@vger.kernel.org, rajatja@google.com +Description: List of uncorrectable nonfatal errors seen and reported by this + PCI device using ERR_NONFATAL. Note that since multiple errors + may be reported using a single ERR_NONFATAL message, thus + TOTAL_ERR_NONFATAL at the end of the file may not match the + actual total of all the errors in the file.
Sample output: +------------------------------------------------------------------------- +localhost /sys/devices/pci0000:00/0000:00:1c.0 # cat aer_dev_nonfatal +Undefined 0 +Data Link Protocol 0 +Surprise Down Error 0 +Poisoned TLP 0 +Flow Control Protocol 0 +Completion Timeout 0 +Completer Abort 0 +Unexpected Completion 0 +Receiver Overflow 0 +Malformed TLP 0 +ECRC 0 +Unsupported Request 0 +ACS Violation 0 +Uncorrectable Internal Error 0 +MC Blocked TLP 0 +AtomicOp Egress Blocked 0 +TLP Prefix Blocked Error 0 +TOTAL_ERR_NONFATAL 0 +------------------------------------------------------------------------- diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt index acd0dddd6bb8..48ce7903e3c6 100644 --- a/Documentation/PCI/pcieaer-howto.txt +++ b/Documentation/PCI/pcieaer-howto.txt @@ -73,6 +73,11 @@ In the example, 'Requester ID' means the ID of the device who sends the error message to root port. Pls. refer to pci express specs for other fields. +2.4 AER Statistics / Counters + +When PCIe AER errors are captured, the counters / statistics are also exposed +in the form of sysfs attributes which are documented at +Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats 3. 
Developer Guide diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 0c4653c1d2ce..9f1cb9051d7d 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1746,6 +1746,9 @@ static const struct attribute_group *pci_dev_attr_groups[] = { #endif &pci_bridge_attr_group, &pcie_dev_attr_group, +#ifdef CONFIG_PCIEAER + &aer_stats_attr_group, +#endif NULL, }; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1877a14e06a9..b1ce0dcad1dc 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -484,6 +484,7 @@ static inline int devm_of_pci_get_host_bridge_resources(struct device *dev, void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); +extern const struct attribute_group aer_stats_attr_group; #else static inline void pci_no_aer(void) { } static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index fe1b9d22a331..b18c5aca30bd 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -542,6 +542,99 @@ static const char *aer_agent_string[] = { "Transmitter ID" }; +#define aer_stats_dev_attr(name, stats_array, strings_array, \ + total_string, total_field) \ + static ssize_t \ + name##_show(struct device *dev, struct device_attribute *attr, \ + char *buf) \ +{ \ + unsigned int i; \ + char *str = buf; \ + struct pci_dev *pdev = to_pci_dev(dev); \ + u64 *stats = pdev->aer_stats->stats_array; \ + \ + for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \ + if (strings_array[i]) \ + str += sprintf(str, "%s %llu\n", \ + strings_array[i], stats[i]); \ + else if (stats[i]) \ + str += sprintf(str, #stats_array "_bit[%d] %llu\n",\ + i, stats[i]); \ + } \ + str += sprintf(str, "TOTAL_%s %llu\n", total_string, \ + pdev->aer_stats->total_field); \ + return str-buf; \ +} \ +static DEVICE_ATTR_RO(name) + +aer_stats_dev_attr(aer_dev_correctable, dev_cor_errs, + aer_correctable_error_string, "ERR_COR", + dev_total_cor_errs); 
+aer_stats_dev_attr(aer_dev_fatal, dev_fatal_errs, + aer_uncorrectable_error_string, "ERR_FATAL", + dev_total_fatal_errs); +aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, + aer_uncorrectable_error_string, "ERR_NONFATAL", + dev_total_nonfatal_errs); + +static struct attribute *aer_stats_attrs[] __ro_after_init = { + &dev_attr_aer_dev_correctable.attr, + &dev_attr_aer_dev_fatal.attr, + &dev_attr_aer_dev_nonfatal.attr, + NULL +}; + +static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pdev->aer_stats) + return 0; + + return a->mode; +} + +const struct attribute_group aer_stats_attr_group = { + .attrs = aer_stats_attrs, + .is_visible = aer_stats_attrs_are_visible, +}; + +static void pci_dev_aer_stats_incr(struct pci_dev *pdev, + struct aer_err_info *info) +{ + int status, i, max = -1; + u64 *counter = NULL; + struct aer_stats *aer_stats = pdev->aer_stats; + + if (!aer_stats) + return; + + switch (info->severity) { + case AER_CORRECTABLE: + aer_stats->dev_total_cor_errs++; + counter = &aer_stats->dev_cor_errs[0]; + max = AER_MAX_TYPEOF_COR_ERRS; + break; + case AER_NONFATAL: + aer_stats->dev_total_nonfatal_errs++; + counter = &aer_stats->dev_nonfatal_errs[0]; + max = AER_MAX_TYPEOF_UNCOR_ERRS; + break; + case AER_FATAL: + aer_stats->dev_total_fatal_errs++; + counter = &aer_stats->dev_fatal_errs[0]; + max = AER_MAX_TYPEOF_UNCOR_ERRS; + break; + } + + status = (info->status & ~info->mask); + for (i = 0; i < max; i++) + if (status & (1 << i)) + counter[i]++; +} + static void __print_tlp_header(struct pci_dev *dev, struct aer_header_log_regs *t) { @@ -574,6 +667,7 @@ static void __aer_print_error(struct pci_dev *dev, pci_err(dev, " [%2d] Unknown Error Bit%s\n", i, info->first_error == i ? 
" (First)" : ""); } + pci_dev_aer_stats_incr(dev, info); } void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) From 12833017e581c384afa35fb85ce540082b2d59fc Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Thu, 21 Jun 2018 16:48:29 -0700 Subject: [PATCH 06/16] PCI/AER: Add sysfs attributes for rootport cumulative stats Add sysfs attributes for rootport statistics (that are cumulative of all the ERR_* messages seen on this PCI hierarchy). Signed-off-by: Rajat Jain Signed-off-by: Bjorn Helgaas --- .../testing/sysfs-bus-pci-devices-aer_stats | 28 +++++++++++ drivers/pci/pcie/aer.c | 47 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats index 3a784297cfed..4b0318c99507 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats @@ -92,3 +92,31 @@ AtomicOp Egress Blocked 0 TLP Prefix Blocked Error 0 TOTAL_ERR_NONFATAL 0 ------------------------------------------------------------------------- + +============================ +PCIe Rootport AER statistics +============================ +These attributes show up under only the rootports (or root complex event +collectors) that are AER capable. These indicate the number of error messages as +"reported to" the rootport. Please note that the rootports also transmit +(internally) the ERR_* messages for errors seen by the internal rootport PCI +device, so these counters include them and are thus cumulative of all the error +messages on the PCI hierarchy originating at that root port. + +Where: /sys/bus/pci/devices//aer_stats/aer_rootport_total_err_cor +Date: July 2018 +Kernel Version: 4.19.0 +Contact: linux-pci@vger.kernel.org, rajatja@google.com +Description: Total number of ERR_COR messages reported to rootport. 
+ +Where: /sys/bus/pci/devices//aer_stats/aer_rootport_total_err_fatal +Date: July 2018 +Kernel Version: 4.19.0 +Contact: linux-pci@vger.kernel.org, rajatja@google.com +Description: Total number of ERR_FATAL messages reported to rootport. + +Where: /sys/bus/pci/devices//aer_stats/aer_rootport_total_err_nonfatal +Date: July 2018 +Kernel Version: 4.19.0 +Contact: linux-pci@vger.kernel.org, rajatja@google.com +Description: Total number of ERR_NONFATAL messages reported to rootport. diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index b18c5aca30bd..47c67de1ccf1 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -577,10 +577,30 @@ aer_stats_dev_attr(aer_dev_nonfatal, dev_nonfatal_errs, aer_uncorrectable_error_string, "ERR_NONFATAL", dev_total_nonfatal_errs); +#define aer_stats_rootport_attr(name, field) \ + static ssize_t \ + name##_show(struct device *dev, struct device_attribute *attr, \ + char *buf) \ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + return sprintf(buf, "%llu\n", pdev->aer_stats->field); \ +} \ +static DEVICE_ATTR_RO(name) + +aer_stats_rootport_attr(aer_rootport_total_err_cor, + rootport_total_cor_errs); +aer_stats_rootport_attr(aer_rootport_total_err_fatal, + rootport_total_fatal_errs); +aer_stats_rootport_attr(aer_rootport_total_err_nonfatal, + rootport_total_nonfatal_errs); + static struct attribute *aer_stats_attrs[] __ro_after_init = { &dev_attr_aer_dev_correctable.attr, &dev_attr_aer_dev_fatal.attr, &dev_attr_aer_dev_nonfatal.attr, + &dev_attr_aer_rootport_total_err_cor.attr, + &dev_attr_aer_rootport_total_err_fatal.attr, + &dev_attr_aer_rootport_total_err_nonfatal.attr, NULL }; @@ -593,6 +613,12 @@ static umode_t aer_stats_attrs_are_visible(struct kobject *kobj, if (!pdev->aer_stats) return 0; + if ((a == &dev_attr_aer_rootport_total_err_cor.attr || + a == &dev_attr_aer_rootport_total_err_fatal.attr || + a == &dev_attr_aer_rootport_total_err_nonfatal.attr) && + pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT) + 
return 0; + return a->mode; } @@ -635,6 +661,25 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, counter[i]++; } +static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, + struct aer_err_source *e_src) +{ + struct aer_stats *aer_stats = pdev->aer_stats; + + if (!aer_stats) + return; + + if (e_src->status & PCI_ERR_ROOT_COR_RCV) + aer_stats->rootport_total_cor_errs++; + + if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) { + if (e_src->status & PCI_ERR_ROOT_FATAL_RCV) + aer_stats->rootport_total_fatal_errs++; + else + aer_stats->rootport_total_nonfatal_errs++; + } +} + static void __print_tlp_header(struct pci_dev *dev, struct aer_header_log_regs *t) { @@ -1085,6 +1130,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc, struct pci_dev *pdev = rpc->rpd; struct aer_err_info *e_info = &rpc->e_info; + pci_rootport_aer_stats_incr(pdev, e_src); + /* * There is a possibility that both correctable error and * uncorrectable error being logged. Report correctable error first. From 7af02fcd84c16801958936f88b848944c726ca07 Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Tue, 3 Jul 2018 18:27:43 -0500 Subject: [PATCH 07/16] PCI/AER: Honor "pcie_ports=native" even if HEST sets FIRMWARE_FIRST According to the documentation, "pcie_ports=native", linux should use native AER and DPC services. While that is true for the _OSC method parsing, this is not the only place that is checked. Should the HEST list PCIe ports as firmware-first, linux will not use native services. This happens because aer_acpi_firmware_first() doesn't take 'pcie_ports' into account. This is wrong. DPC uses the same logic when it decides whether to load or not, so fixing this also fixes DPC not loading. 
Signed-off-by: Alexandru Gagniuc [bhelgaas: return "false" from bool function (from kbuild robot)] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 47c67de1ccf1..766687094706 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -322,6 +322,9 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) if (!pci_is_pcie(dev)) return 0; + if (pcie_ports_native) + return 0; + if (!dev->__aer_firmware_first_valid) aer_set_firmware_first(dev); return dev->__aer_firmware_first; @@ -342,6 +345,9 @@ bool aer_acpi_firmware_first(void) .firmware_first = 0, }; + if (pcie_ports_native) + return false; + if (!parsed) { apei_hest_parse(aer_hest_parse, &info); aer_firmware_first = info.firmware_first; From 7ab92e89bf8b0a93f0d53b6d83270e4cd0f7c563 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 19 Jul 2018 17:55:58 -0500 Subject: [PATCH 08/16] PCI/AER: Clear only ERR_FATAL status bits during fatal recovery During recovery from fatal errors, we previously called pci_cleanup_aer_uncorrect_error_status(), which cleared *all* uncorrectable error status bits (both ERR_FATAL and ERR_NONFATAL). Instead, call a new pci_aer_clear_fatal_status() that clears only the ERR_FATAL bits (as indicated by the PCI_ERR_UNCOR_SEVER register). 
Based-on-patch-by: Oza Pawandeep Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/aer.c | 17 +++++++++++++++++ drivers/pci/pcie/err.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b1ce0dcad1dc..107c64892b66 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -485,10 +485,12 @@ void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; +void pci_aer_clear_fatal_status(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } static inline void pci_aer_exit(struct pci_dev *d) { } +static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } #endif #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 766687094706..b776a768a434 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -399,6 +399,23 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status); +void pci_aer_clear_fatal_status(struct pci_dev *dev) +{ + int pos; + u32 status, sev; + + pos = dev->aer_cap; + if (!pos) + return; + + /* Clear status bits for ERR_FATAL errors only */ + pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); + pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + status &= sev; + if (status) + pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); +} + int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) { int pos; diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index f7ce0cb0b0b7..0539518f9861 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -316,7 +316,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) * do error recovery on all subordinates of the bridge instead * of the bridge and clear the 
error status of the bridge. */ - pci_cleanup_aer_uncorrect_error_status(dev); + pci_aer_clear_fatal_status(dev); } if (result == PCI_ERS_RESULT_RECOVERED) { From e7b0b847de6db161e3917732276e425bc92a2feb Mon Sep 17 00:00:00 2001 From: Oza Pawandeep Date: Thu, 19 Jul 2018 17:58:05 -0500 Subject: [PATCH 09/16] PCI/AER: Clear only ERR_NONFATAL bits during non-fatal recovery pci_cleanup_aer_uncorrect_error_status() is called by driver .slot_reset() methods when handling ERR_NONFATAL errors. Previously this cleared *all* the bits, including ERR_FATAL bits. Since we're only handling ERR_NONFATAL errors, clear only the ERR_NONFATAL error status bits. Signed-off-by: Oza Pawandeep [bhelgaas: split to separate patch] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index b776a768a434..f853e72524be 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -385,13 +385,16 @@ EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) { int pos; - u32 status; + u32 status, sev; pos = dev->aer_cap; if (!pos) return -EIO; + /* Clear status bits for ERR_NONFATAL errors only */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); + status &= ~sev; if (status) pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); From 5b6c09660da8779dd545fa717c2b0cc79d477c9e Mon Sep 17 00:00:00 2001 From: Oza Pawandeep Date: Thu, 19 Jul 2018 17:58:06 -0500 Subject: [PATCH 10/16] PCI/AER: Factor out ERR_NONFATAL status bit clearing aer_error_resume() clears all ERR_NONFATAL error status bits. This is exactly what pci_cleanup_aer_uncorrect_error_status() does, so use that instead of duplicating the code.
Signed-off-by: Oza Pawandeep [bhelgaas: split to separate patch] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f853e72524be..4411ada4a91c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1532,20 +1532,13 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) */ static void aer_error_resume(struct pci_dev *dev) { - int pos; - u32 status, mask; u16 reg16; /* Clean up Root device status */ pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16); pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16); - /* Clean AER Root Error Status */ - pos = dev->aer_cap; - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask); - status &= ~mask; /* Clear corresponding nonfatal bits */ - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status); + pci_cleanup_aer_uncorrect_error_status(dev); } static struct pcie_port_service_driver aerdriver = { From 43ec03a9e5f382ff70fdef35b4ea813263cd8270 Mon Sep 17 00:00:00 2001 From: Oza Pawandeep Date: Thu, 19 Jul 2018 17:58:07 -0500 Subject: [PATCH 11/16] PCI/AER: Remove ERR_FATAL code from ERR_NONFATAL path broadcast_error_message() is only used for ERR_NONFATAL events, when the state is always pci_channel_io_normal, so remove the unused alternate path. Signed-off-by: Oza Pawandeep [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/err.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 0539518f9861..638eda5c1d79 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -259,15 +259,10 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, /* * If the error is reported by an end point, we think this * error is related to the upstream link of the end point.
+ * The error is non fatal so the bus is ok; just invoke + * the callback for the function that logged the error. */ - if (state == pci_channel_io_normal) - /* - * the error is non fatal so the bus is ok, just invoke - * the callback for the function that logged the error. - */ - cb(dev, &result_data); - else - pci_walk_bus(dev->bus, cb, &result_data); + cb(dev, &result_data); } return result_data.result; From ec752f5d54d723af3df03959637f963079643cd8 Mon Sep 17 00:00:00 2001 From: Oza Pawandeep Date: Thu, 19 Jul 2018 17:58:09 -0500 Subject: [PATCH 12/16] PCI/AER: Clear device status bits during ERR_FATAL and ERR_NONFATAL Clear the device status bits while handling both ERR_FATAL and ERR_NONFATAL cases. Signed-off-by: Oza Pawandeep [bhelgaas: rename to pci_aer_clear_device_status(), declare internal to PCI core instead of exposing it everywhere] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ drivers/pci/pcie/aer.c | 15 +++++++++------ drivers/pci/pcie/err.c | 2 ++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 107c64892b66..138a2b66f620 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -486,11 +486,13 @@ void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); +void pci_aer_clear_device_status(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline int pci_aer_init(struct pci_dev *d) { return -ENODEV; } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } +static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } #endif #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 4411ada4a91c..2a40b24ae4e3 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -382,6 +382,14 @@ int 
pci_disable_pcie_error_reporting(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting); +void pci_aer_clear_device_status(struct pci_dev *dev) +{ + u16 sta; + + pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta); + pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta); +} + int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) { int pos; @@ -1532,12 +1540,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) */ static void aer_error_resume(struct pci_dev *dev) { - u16 reg16; - - /* Clean up Root device status */ - pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16); - pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16); - + pci_aer_clear_device_status(dev); pci_cleanup_aer_uncorrect_error_status(dev); } diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 638eda5c1d79..fdbcc555860d 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, dev->error_state = state; pci_walk_bus(dev->subordinate, cb, &result_data); if (cb == report_resume) { + pci_aer_clear_device_status(dev); pci_cleanup_aer_uncorrect_error_status(dev); dev->error_state = pci_channel_io_normal; } @@ -312,6 +313,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) * of the bridge and clear the error status of the bridge. */ pci_aer_clear_fatal_status(dev); + pci_aer_clear_device_status(dev); } if (result == PCI_ERS_RESULT_RECOVERED) { From 10d790d99d3b42ec07d54178b291708f14af886d Mon Sep 17 00:00:00 2001 From: Oza Pawandeep Date: Thu, 19 Jul 2018 17:58:09 -0500 Subject: [PATCH 13/16] PCI/AER: Clear device status bits during ERR_COR handling In case of correctable error, the Correctable Error Detected bit in the Device Status register is set. Clear it after handling the error.
Signed-off-by: Oza Pawandeep Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 2a40b24ae4e3..2b344c9e2d46 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1001,6 +1001,7 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) if (pos) pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS, info->status); + pci_aer_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) pcie_do_nonfatal_recovery(dev); else if (info->severity == AER_FATAL) From 89e1f5cb1ecc1cd509a196f4e79d12a1e39410b6 Mon Sep 17 00:00:00 2001 From: Oza Pawandeep Date: Thu, 19 Jul 2018 17:58:10 -0500 Subject: [PATCH 14/16] PCI/portdrv: Remove pcie_portdrv_err_handler.slot_reset The pci_error_handlers.slot_reset() callback is only used for non-bridge devices (see broadcast_error_message()). Since portdrv only binds to bridges, we don't need pcie_portdrv_slot_reset(), so remove it. 
Signed-off-by: Oza Pawandeep [bhelgaas: changelog, remove pcie_portdrv_slot_reset() completely] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/portdrv_pci.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 973f1b80a038..b78840f54a9b 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup); /* global data */ -static int pcie_portdrv_restore_config(struct pci_dev *dev) -{ - int retval; - - retval = pci_enable_device(dev); - if (retval) - return retval; - pci_set_master(dev); - return 0; -} - #ifdef CONFIG_PM static int pcie_port_runtime_suspend(struct device *dev) { @@ -160,19 +149,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev) return PCI_ERS_RESULT_RECOVERED; } -static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) -{ - /* If fatal, restore cfg space for possible link reset at upstream */ - if (dev->error_state == pci_channel_io_frozen) { - dev->state_saved = true; - pci_restore_state(dev); - pcie_portdrv_restore_config(dev); - pci_enable_pcie_error_reporting(dev); - } - - return PCI_ERS_RESULT_RECOVERED; -} - static int resume_iter(struct device *device, void *data) { struct pcie_device *pcie_device; @@ -208,7 +184,6 @@ static const struct pci_device_id port_pci_ids[] = { { static const struct pci_error_handlers pcie_portdrv_err_handler = { .error_detected = pcie_portdrv_error_detected, .mmio_enabled = pcie_portdrv_mmio_enabled, - .slot_reset = pcie_portdrv_slot_reset, .resume = pcie_portdrv_err_resume, }; From 944d58595be02634cc295e341306ccda2365554d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 31 Jul 2018 16:26:09 -0500 Subject: [PATCH 15/16] PCI/AER: Remove duplicate PCI_EXP_AER_FLAGS definition PCI_EXP_AER_FLAGS was defined twice (with identical definitions), once under #ifdef CONFIG_ACPI_APEI, and again at the top level. 
This looks like my merge error from these commits: fd3362cb73de ("PCI/AER: Squash aerdrv_core.c into aerdrv.c") 41cbc9eb1a82 ("PCI/AER: Squash ecrc.c into aerdrv.c") Remove the duplicate PCI_EXP_AER_FLAGS definition. Fixes: 41cbc9eb1a82 ("PCI/AER: Squash ecrc.c into aerdrv.c") Signed-off-by: Bjorn Helgaas Reviewed-by: Oza Pawandeep --- drivers/pci/pcie/aer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 2b344c9e2d46..c6cc855bfa22 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -329,8 +329,6 @@ int pcie_aer_get_firmware_first(struct pci_dev *dev) aer_set_firmware_first(dev); return dev->__aer_firmware_first; } -#define PCI_EXP_AER_FLAGS (PCI_EXP_DEVCTL_CERE | PCI_EXP_DEVCTL_NFERE | \ - PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE) static bool aer_firmware_first; From 45687f96c112adda2f1d1f05b977661eb00d5a1c Mon Sep 17 00:00:00 2001 From: Alexandru Gagniuc Date: Tue, 17 Jul 2018 10:31:23 -0500 Subject: [PATCH 16/16] PCI/AER: Don't clear AER bits if error handling is Firmware-First If the platform requests Firmware-First error handling, firmware is responsible for reading and clearing AER status bits. If OSPM also clears them, we may miss errors. See ACPI v6.2, sec 18.3.2.5 and 18.4. This race is mostly of theoretical significance, as it is not easy to reasonably demonstrate it in testing. 
Signed-off-by: Alexandru Gagniuc [bhelgaas: add similar guards to pci_cleanup_aer_uncorrect_error_status() and pci_aer_clear_fatal_status()] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index c6cc855bfa22..4e823ae051a7 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -397,6 +397,9 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) if (!pos) return -EIO; + if (pcie_aer_get_firmware_first(dev)) + return -EIO; + /* Clear status bits for ERR_NONFATAL errors only */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); @@ -417,6 +420,9 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev) if (!pos) return; + if (pcie_aer_get_firmware_first(dev)) + return; + /* Clear status bits for ERR_FATAL errors only */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status); pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &sev); @@ -438,6 +444,9 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev) if (!pos) return -EIO; + if (pcie_aer_get_firmware_first(dev)) + return -EIO; + port_type = pci_pcie_type(dev); if (port_type == PCI_EXP_TYPE_ROOT_PORT) { pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);