From ac325acd50013fa8f4953208cbb96504dec9b12a Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 18 Apr 2006 21:05:21 -0700 Subject: [PATCH] powerpc/pseries: clear PCI failure counter if no new failures The current PCI error recovery system keeps track of the number of PCI card resets, and refuses to bring a card back up if this number is too large. The goal of doing this was to avoid an infinite loop of resets if a card is obviously dead. However, if the failures are rare, but the machine has a high uptime, this mechanism might still be triggered; this is too harsh. This patch will avoids this problem by decrementing the fail count after an hour. Thus, as long as a pci card BSOD's less than 6 times an hour, it will continue to be reset indefinitely. If it's failure rate is greater than that, it will be taken off-line permanently. This patch is larger than it might otherwise be because it changes indentation by removing a pointless while-loop. The while loop is not needed, as the handler is invoked once fo each event (by schedule_work()); the loop is leftover cruft from an earlier implementation. Signed-off-by: Linas Vepstas Signed-off-by: Andrew Morton Signed-off-by: Paul Mackerras --- arch/powerpc/platforms/pseries/eeh_event.c | 50 +++++++++++++++++------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/platforms/pseries/eeh_event.c') diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index a1bda6f..a0b3964 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -18,6 +18,7 @@ * Copyright (c) 2005 Linas Vepstas */ +#include #include #include #include @@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy) { unsigned long flags; struct eeh_event *event; + struct pci_dn *pdn; daemonize ("eehd"); + set_current_state(TASK_INTERRUPTIBLE); - while (1) { - set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; - spin_lock_irqsave(&eeh_eventlist_lock, flags); - event = NULL; + /* Unqueue the event, get ready to process. */ + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); - /* Unqueue the event, get ready to process. */ - if (!list_empty(&eeh_eventlist)) { - event = list_entry(eeh_eventlist.next, struct eeh_event, list); - list_del(&event->list); - } - spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (event == NULL) + return 0; - if (event == NULL) - break; + /* Serialize processing of EEH events */ + mutex_lock(&eeh_event_mutex); + eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); - /* Serialize processing of EEH events */ - mutex_lock(&eeh_event_mutex); - eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", + pci_name(event->dev)); - printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - pci_name(event->dev)); + pdn = handle_eeh_events(event); - handle_eeh_events(event); + eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); + pci_dev_put(event->dev); + kfree(event); + mutex_unlock(&eeh_event_mutex); - eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); - pci_dev_put(event->dev); - kfree(event); - mutex_unlock(&eeh_event_mutex); + /* If there are no new errors after an hour, clear the counter. */ + if (pdn && pdn->eeh_freeze_count>0) { + msleep_interruptible (3600*1000); + if (pdn->eeh_freeze_count>0) + pdn->eeh_freeze_count--; } return 0; -- cgit v1.1