From 77bd741561016134d1761d6101c4f0361025062f Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Thu, 3 Nov 2005 18:52:49 -0600 Subject: [PATCH] powerpc: PCI Error Recovery: PPC64 core recovery routines Various PCI bus errors can be signaled by newer PCI controllers. The core error recovery routines are architecture dependent. This patch adds a recovery infrastructure for the PPC64 pSeries systems. Signed-off-by: Linas Vepstas Signed-off-by: Paul Mackerras (cherry picked from e8ca11b460c4c9c7fa6b529be221529ebd770e38 commit) --- arch/powerpc/platforms/pseries/eeh_event.c | 39 +++++++++--------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'arch/powerpc/platforms/pseries/eeh_event.c') diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index 9249733..9a9961f 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -21,6 +21,7 @@ #include #include #include +#include /** Overview: * EEH error states may be detected within exception handlers; @@ -37,31 +38,6 @@ static void eeh_thread_launcher(void *); DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); /** - * eeh_panic - call panic() for an eeh event that cannot be handled. - * The philosophy of this routine is that it is better to panic and - * halt the OS than it is to risk possible data corruption by - * oblivious device drivers that don't know better. - * - * @dev pci device that had an eeh event - * @reset_state current reset state of the device slot - */ -static void eeh_panic(struct pci_dev *dev, int reset_state) -{ - /* - * Since the panic_on_oops sysctl is used to halt the system - * in light of potential corruption, we can use it here. - */ - if (panic_on_oops) { - panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, - pci_name(dev)); - } - else { - printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", - reset_state, pci_name(dev)); - } -} - -/** * eeh_event_handler - dispatch EEH events. The detection of a frozen * slot can occur inside an interrupt, where it can be hard to do * anything about it. The goal of this routine is to pull these @@ -82,10 +58,16 @@ static int eeh_event_handler(void * dummy) spin_lock_irqsave(&eeh_eventlist_lock, flags); event = NULL; + + /* Unqueue the event, get ready to process. */ if (!list_empty(&eeh_eventlist)) { event = list_entry(eeh_eventlist.next, struct eeh_event, list); list_del(&event->list); } + + if (event) + eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); if (event == NULL) break; @@ -93,8 +75,11 @@ static int eeh_event_handler(void * dummy) printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", pci_name(event->dev)); - eeh_panic (event->dev, event->state); + handle_eeh_events(event); + + eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); + pci_dev_put(event->dev); kfree(event); } @@ -122,7 +107,7 @@ static void eeh_thread_launcher(void *dummy) */ int eeh_send_failure_event (struct device_node *dn, struct pci_dev *dev, - int state, + enum pci_channel_state state, int time_unavail) { unsigned long flags; -- cgit v1.1