Merge branch 'hns3-error-handling'

Salil Mehta says:

====================
net: hns3: Additions/optimizations related to HNS3 H/W err handling

This patch set primarily does following addtions and optimizations
related to error handling in HNS3 Ethernet driver:

 1. Name changes for enable and process functions and minor loop
    optimizations. [PATCH 1-6]
 2. Modify query and clearing of RAS errors using new set of commands
    because modules specific commands for clearing RCB PPP PF, SSU are
    obselete. [PATCH 7]
 3. Deletes logging 1-bit errors for RAS in HNS3 driver as these never
    get reported to the driver. [PATCH 8]
 4. Add handling of NIC hw errors reported through MSIx rather than
    PCIe AER channel. [PATCH 9]
 5. Add handling for the HW RAS and MSIx errors in the modules MAC, PPP
    PF, MSIx SRAM, RCB and SSU. [PATCH 10-13]
 6. Add handling of RoCEE RAS errors. [PATCH 14]
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2018-12-07 15:57:02 -08:00
commit 12edfdfc79
7 changed files with 1127 additions and 717 deletions

View File

@ -136,6 +136,7 @@ enum hnae3_reset_type {
HNAE3_CORE_RESET,
HNAE3_GLOBAL_RESET,
HNAE3_IMP_RESET,
HNAE3_UNKNOWN_RESET,
HNAE3_NONE_RESET,
};
@ -454,7 +455,7 @@ struct hnae3_ae_ops {
int (*restore_fd_rules)(struct hnae3_handle *handle);
void (*enable_fd)(struct hnae3_handle *handle, bool enable);
int (*dbg_run_cmd)(struct hnae3_handle *handle, char *cmd_buf);
pci_ers_result_t (*process_hw_error)(struct hnae3_ae_dev *ae_dev);
pci_ers_result_t (*handle_hw_ras_error)(struct hnae3_ae_dev *ae_dev);
bool (*get_hw_reset_stat)(struct hnae3_handle *handle);
bool (*ae_dev_resetting)(struct hnae3_handle *handle);
unsigned long (*ae_dev_reset_cnt)(struct hnae3_handle *handle);

View File

@ -1828,8 +1828,8 @@ static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev,
return PCI_ERS_RESULT_NONE;
}
if (ae_dev->ops->process_hw_error)
ret = ae_dev->ops->process_hw_error(ae_dev);
if (ae_dev->ops->handle_hw_ras_error)
ret = ae_dev->ops->handle_hw_ras_error(ae_dev);
else
return PCI_ERS_RESULT_NONE;

View File

@ -215,26 +215,29 @@ enum hclge_opcode_type {
HCLGE_OPC_SFP_GET_SPEED = 0x7104,
/* Error INT commands */
HCLGE_MAC_COMMON_INT_EN = 0x030E,
HCLGE_TM_SCH_ECC_INT_EN = 0x0829,
HCLGE_TM_SCH_ECC_ERR_RINT_CMD = 0x082d,
HCLGE_TM_SCH_ECC_ERR_RINT_CE = 0x082f,
HCLGE_TM_SCH_ECC_ERR_RINT_NFE = 0x0830,
HCLGE_TM_SCH_ECC_ERR_RINT_FE = 0x0831,
HCLGE_TM_SCH_MBIT_ECC_INFO_CMD = 0x0833,
HCLGE_SSU_ECC_INT_CMD = 0x0989,
HCLGE_SSU_COMMON_INT_CMD = 0x098C,
HCLGE_PPU_MPF_ECC_INT_CMD = 0x0B40,
HCLGE_PPU_MPF_OTHER_INT_CMD = 0x0B41,
HCLGE_PPU_PF_OTHER_INT_CMD = 0x0B42,
HCLGE_COMMON_ECC_INT_CFG = 0x1505,
HCLGE_IGU_EGU_TNL_INT_QUERY = 0x1802,
HCLGE_QUERY_RAS_INT_STS_BD_NUM = 0x1510,
HCLGE_QUERY_CLEAR_MPF_RAS_INT = 0x1511,
HCLGE_QUERY_CLEAR_PF_RAS_INT = 0x1512,
HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580,
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584,
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
HCLGE_IGU_EGU_TNL_INT_CLR = 0x1804,
HCLGE_IGU_COMMON_INT_QUERY = 0x1805,
HCLGE_IGU_COMMON_INT_EN = 0x1806,
HCLGE_IGU_COMMON_INT_CLR = 0x1807,
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
HCLGE_TM_QCN_MEM_INT_INFO_CMD = 0x1A17,
HCLGE_PPP_CMD0_INT_CMD = 0x2100,
HCLGE_PPP_CMD1_INT_CMD = 0x2101,
HCLGE_NCSI_INT_QUERY = 0x2400,
HCLGE_NCSI_INT_EN = 0x2401,
HCLGE_NCSI_INT_CLR = 0x2402,
};
#define HCLGE_TQP_REG_OFFSET 0x80000

File diff suppressed because it is too large Load Diff

View File

@ -7,9 +7,11 @@
#include "hclge_main.h"
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
#define HCLGE_RAS_REG_FE_MASK 0xFF
#define HCLGE_RAS_REG_NFE_MASK 0xFF00
#define HCLGE_RAS_REG_NFE_SHIFT 8
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800
#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN 0xFFFF0000
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN_MASK 0xFFFF0000
@ -23,6 +25,8 @@
#define HCLGE_IMP_RD_POISON_ERR_INT_EN_MASK 0x0100
#define HCLGE_TQP_ECC_ERR_INT_EN 0x0FFF
#define HCLGE_TQP_ECC_ERR_INT_EN_MASK 0x0FFF
#define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN_MASK 0x0F000000
#define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN 0x0F000000
#define HCLGE_IGU_ERR_INT_EN 0x0000066F
#define HCLGE_IGU_ERR_INT_EN_MASK 0x000F
#define HCLGE_IGU_TNL_ERR_INT_EN 0x0002AABF
@ -41,21 +45,55 @@
#define HCLGE_TM_QCN_MEM_ERR_INT_EN 0xFFFFFF
#define HCLGE_NCSI_ERR_INT_EN 0x3
#define HCLGE_NCSI_ERR_INT_TYPE 0x9
#define HCLGE_MAC_COMMON_ERR_INT_EN GENMASK(7, 0)
#define HCLGE_MAC_COMMON_ERR_INT_EN_MASK GENMASK(7, 0)
#define HCLGE_PPU_MPF_ABNORMAL_INT0_EN GENMASK(31, 0)
#define HCLGE_PPU_MPF_ABNORMAL_INT0_EN_MASK GENMASK(31, 0)
#define HCLGE_PPU_MPF_ABNORMAL_INT1_EN GENMASK(31, 0)
#define HCLGE_PPU_MPF_ABNORMAL_INT1_EN_MASK GENMASK(31, 0)
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN 0x3FFF3FFF
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN_MASK 0x3FFF3FFF
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN2 0xB
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN2_MASK 0xB
#define HCLGE_PPU_MPF_ABNORMAL_INT3_EN GENMASK(7, 0)
#define HCLGE_PPU_MPF_ABNORMAL_INT3_EN_MASK GENMASK(23, 16)
#define HCLGE_PPU_PF_ABNORMAL_INT_EN GENMASK(5, 0)
#define HCLGE_PPU_PF_ABNORMAL_INT_EN_MASK GENMASK(5, 0)
#define HCLGE_SSU_1BIT_ECC_ERR_INT_EN GENMASK(31, 0)
#define HCLGE_SSU_1BIT_ECC_ERR_INT_EN_MASK GENMASK(31, 0)
#define HCLGE_SSU_MULTI_BIT_ECC_ERR_INT_EN GENMASK(31, 0)
#define HCLGE_SSU_MULTI_BIT_ECC_ERR_INT_EN_MASK GENMASK(31, 0)
#define HCLGE_SSU_BIT32_ECC_ERR_INT_EN 0x0101
#define HCLGE_SSU_BIT32_ECC_ERR_INT_EN_MASK 0x0101
#define HCLGE_SSU_COMMON_INT_EN GENMASK(9, 0)
#define HCLGE_SSU_COMMON_INT_EN_MASK GENMASK(9, 0)
#define HCLGE_SSU_PORT_BASED_ERR_INT_EN 0x0BFF
#define HCLGE_SSU_PORT_BASED_ERR_INT_EN_MASK 0x0BFF0000
#define HCLGE_SSU_FIFO_OVERFLOW_ERR_INT_EN GENMASK(23, 0)
#define HCLGE_SSU_FIFO_OVERFLOW_ERR_INT_EN_MASK GENMASK(23, 0)
#define HCLGE_IMP_TCM_ECC_INT_MASK 0xFFFF
#define HCLGE_IMP_ITCM4_ECC_INT_MASK 0x3
#define HCLGE_CMDQ_ECC_INT_MASK 0xFFFF
#define HCLGE_CMDQ_ROC_ECC_INT_SHIFT 16
#define HCLGE_TQP_ECC_INT_MASK 0xFFF
#define HCLGE_TQP_ECC_INT_SHIFT 16
#define HCLGE_IMP_TCM_ECC_CLR_MASK 0xFFFF
#define HCLGE_IMP_ITCM4_ECC_CLR_MASK 0x3
#define HCLGE_CMDQ_NIC_ECC_CLR_MASK 0xFFFF
#define HCLGE_CMDQ_ROCEE_ECC_CLR_MASK 0xFFFF0000
#define HCLGE_TQP_IMP_ERR_CLR_MASK 0x0FFF0001
#define HCLGE_IGU_COM_INT_MASK 0xF
#define HCLGE_IGU_EGU_TNL_INT_MASK 0x3F
#define HCLGE_PPP_PF_INT_MASK 0x100
#define HCLGE_SSU_COMMON_ERR_INT_MASK GENMASK(9, 0)
#define HCLGE_SSU_PORT_INT_MSIX_MASK 0x7BFF
#define HCLGE_IGU_INT_MASK GENMASK(3, 0)
#define HCLGE_IGU_EGU_TNL_INT_MASK GENMASK(5, 0)
#define HCLGE_PPP_MPF_INT_ST3_MASK GENMASK(5, 0)
#define HCLGE_PPU_MPF_INT_ST3_MASK GENMASK(7, 0)
#define HCLGE_PPU_MPF_INT_ST2_MSIX_MASK GENMASK(29, 28)
#define HCLGE_PPU_PF_INT_MSIX_MASK 0x27
#define HCLGE_QCN_FIFO_INT_MASK GENMASK(17, 0)
#define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0)
#define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0)
#define HCLGE_ROCEE_RAS_NFE_INT_EN 0xF
#define HCLGE_ROCEE_RAS_CE_INT_EN 0x1
#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK 0xF
#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK 0x1
#define HCLGE_ROCEE_RERR_INT_MASK BIT(0)
#define HCLGE_ROCEE_BERR_INT_MASK BIT(1)
#define HCLGE_ROCEE_ECC_INT_MASK BIT(2)
#define HCLGE_ROCEE_OVF_INT_MASK BIT(3)
#define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000
#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F
enum hclge_err_int_type {
HCLGE_ERR_INT_MSIX = 0,
@ -67,9 +105,7 @@ enum hclge_err_int_type {
struct hclge_hw_blk {
u32 msk;
const char *name;
int (*enable_error)(struct hclge_dev *hdev, bool en);
void (*process_error)(struct hclge_dev *hdev,
enum hclge_err_int_type type);
int (*config_err_int)(struct hclge_dev *hdev, bool en);
};
struct hclge_hw_error {
@ -78,6 +114,7 @@ struct hclge_hw_error {
};
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state);
int hclge_enable_tm_hw_error(struct hclge_dev *hdev, bool en);
pci_ers_result_t hclge_process_ras_hw_error(struct hnae3_ae_dev *ae_dev);
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests);
#endif

View File

@ -2200,12 +2200,13 @@ static void hclge_service_complete(struct hclge_dev *hdev)
static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
{
u32 rst_src_reg;
u32 cmdq_src_reg;
u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
/* fetch the events from their corresponding regs */
rst_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
msix_src_reg = hclge_read_dev(&hdev->hw,
HCLGE_VECTOR0_PF_OTHER_INT_STS_REG);
/* Assumption: If by any chance reset and mailbox events are reported
* together then we will only process reset event in this go and will
@ -2239,6 +2240,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
return HCLGE_VECTOR0_EVENT_RST;
}
/* check for vector0 msix event source */
if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK)
return HCLGE_VECTOR0_EVENT_ERR;
/* check for vector0 mailbox(=CMDQ RX) event source */
if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
cmdq_src_reg &= ~BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B);
@ -2289,6 +2294,19 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
/* vector 0 interrupt is shared with reset and mailbox source events.*/
switch (event_cause) {
case HCLGE_VECTOR0_EVENT_ERR:
/* we do not know what type of reset is required now. This could
* only be decided after we fetch the type of errors which
* caused this event. Therefore, we will do below for now:
* 1. Assert HNAE3_UNKNOWN_RESET type of reset. This means we
* have defered type of reset to be used.
* 2. Schedule the reset serivce task.
* 3. When service task receives HNAE3_UNKNOWN_RESET type it
* will fetch the correct type of reset. This would be done
* by first decoding the types of errors.
*/
set_bit(HNAE3_UNKNOWN_RESET, &hdev->reset_request);
/* fall through */
case HCLGE_VECTOR0_EVENT_RST:
hclge_reset_task_schedule(hdev);
break;
@ -2593,6 +2611,23 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hclge_dev *hdev,
{
enum hnae3_reset_type rst_level = HNAE3_NONE_RESET;
/* first, resolve any unknown reset type to the known type(s) */
if (test_bit(HNAE3_UNKNOWN_RESET, addr)) {
/* we will intentionally ignore any errors from this function
* as we will end up in *some* reset request in any case
*/
hclge_handle_hw_msix_error(hdev, addr);
clear_bit(HNAE3_UNKNOWN_RESET, addr);
/* We defered the clearing of the error event which caused
* interrupt since it was not posssible to do that in
* interrupt context (and this is the reason we introduced
* new UNKNOWN reset type). Now, the errors have been
* handled and cleared in hardware we can safely enable
* interrupts. This is an exception to the norm.
*/
hclge_enable_vector(&hdev->misc_vector, true);
}
/* return the highest priority reset level amongst all */
if (test_bit(HNAE3_IMP_RESET, addr)) {
rst_level = HNAE3_IMP_RESET;
@ -7269,7 +7304,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
ret = hclge_hw_error_set_state(hdev, true);
if (ret) {
dev_err(&pdev->dev,
"hw error interrupts enable failed, ret =%d\n", ret);
"fail(%d) to enable hw error interrupts\n", ret);
goto err_mdiobus_unreg;
}
@ -7405,11 +7440,15 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
return ret;
}
/* Re-enable the TM hw error interrupts because
* they get disabled on core/global reset.
/* Re-enable the hw error interrupts because
* the interrupts get disabled on core/global reset.
*/
if (hclge_enable_tm_hw_error(hdev, true))
dev_err(&pdev->dev, "failed to enable TM hw error interrupts\n");
ret = hclge_hw_error_set_state(hdev, true);
if (ret) {
dev_err(&pdev->dev,
"fail(%d) to re-enable HNS hw error interrupts\n", ret);
return ret;
}
hclge_reset_vport_state(hdev);
@ -7931,7 +7970,7 @@ static const struct hnae3_ae_ops hclge_ops = {
.restore_fd_rules = hclge_restore_fd_entries,
.enable_fd = hclge_enable_fd,
.dbg_run_cmd = hclge_dbg_run_cmd,
.process_hw_error = hclge_process_ras_hw_error,
.handle_hw_ras_error = hclge_handle_hw_ras_error,
.get_hw_reset_stat = hclge_get_hw_reset_stat,
.ae_dev_resetting = hclge_ae_dev_resetting,
.ae_dev_reset_cnt = hclge_ae_dev_reset_cnt,

View File

@ -205,6 +205,7 @@ enum HCLGE_DEV_STATE {
enum hclge_evt_cause {
HCLGE_VECTOR0_EVENT_RST,
HCLGE_VECTOR0_EVENT_MBX,
HCLGE_VECTOR0_EVENT_ERR,
HCLGE_VECTOR0_EVENT_OTHER,
};