diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index e7f757cd839b..9446248eb6b8 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -808,10 +808,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -823,31 +821,39 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, eeh_report_error, - &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if ((pe->type & EEH_PE_PHB) && + result != PCI_ERS_RESULT_NONE && + result != PCI_ERS_RESULT_NEED_RESET) + result = PCI_ERS_RESULT_NEED_RESET; + } /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - goto hard_fail; + if (result != PCI_ERS_RESULT_DISCONNECT) { + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + result = PCI_ERS_RESULT_DISCONNECT; + } } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); + if (result != PCI_ERS_RESULT_DISCONNECT) { + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + } /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -859,7 +865,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; } } @@ -868,9 +874,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -884,9 +890,9 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); - if (rc < 0) - goto hard_fail; - if (rc) { + if (rc < 0) { + result = PCI_ERS_RESULT_DISCONNECT; + } else if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -899,12 +905,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } } - /* If any device has a hard failure, then shut off everything. */ - if (result == PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: Device driver gave up\n"); - goto hard_fail; - } - /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { pr_info("EEH: Reset without hotplug activity\n"); @@ -912,89 +912,81 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (rc) { pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); - goto hard_fail; + result = PCI_ERS_RESULT_DISCONNECT; + } else { + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); + } + } + + if ((result == PCI_ERS_RESULT_RECOVERED) || + (result == PCI_ERS_RESULT_NONE)) { + /* + * For those hot removed VFs, we should add back them after PF + * get recovered properly. + */ + list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, + rmv_entry) { + eeh_add_virt_device(edev); + list_del(&edev->rmv_entry); } - pr_info("EEH: Notify device drivers " - "the completion of reset\n"); - result = PCI_ERS_RESULT_NONE; + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); eeh_set_channel_state(pe, pci_channel_io_normal); eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, &result); - } + eeh_pe_report("resume", pe, eeh_report_resume, NULL); + eeh_for_each_pe(pe, tmp_pe) { + eeh_pe_for_each_dev(tmp_pe, edev, tmp) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + edev->in_error = false; + } + } - /* All devices should claim they have recovered by now. */ - if ((result != PCI_ERS_RESULT_RECOVERED) && - (result != PCI_ERS_RESULT_NONE)) { - pr_warn("EEH: Not recovered\n"); - goto hard_fail; - } + pr_info("EEH: Recovery successful.\n"); + } else { + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - /* - * For those hot removed VFs, we should add back them after PF get - * recovered properly. - */ - list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list, - rmv_entry) { - eeh_add_virt_device(edev); - list_del(&edev->rmv_entry); - } + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Tell all device drivers that they can resume operations */ - pr_info("EEH: Notify device driver to resume\n"); - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("resume", pe, eeh_report_resume, NULL); - eeh_for_each_pe(pe, tmp_pe) { - eeh_pe_for_each_dev(tmp_pe, edev, tmp) { - edev->mode &= ~EEH_DEV_NO_HANDLER; - edev->in_error = false; + /* Notify all devices that they're about to go down. */ + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); + + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; } } - - pr_info("EEH: Recovery successful.\n"); - goto final; - -hard_fail: - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); - - eeh_slot_error_detail(pe, EEH_LOG_PERM); - - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); - - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); - - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; - } -final: eeh_pe_state_clear(pe, EEH_PE_RECOVERING); }