1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * 4 * Copyright (c) 2005 Linas Vepstas <linas@linas.org> 5 */ 6 7 #include <linux/delay.h> 8 #include <linux/list.h> 9 #include <linux/sched.h> 10 #include <linux/semaphore.h> 11 #include <linux/pci.h> 12 #include <linux/slab.h> 13 #include <linux/kthread.h> 14 #include <asm/eeh_event.h> 15 #include <asm/ppc-pci.h> 16 17 /** Overview: 18 * EEH error states may be detected within exception handlers; 19 * however, the recovery processing needs to occur asynchronously 20 * in a normal kernel context and not an interrupt context. 21 * This pair of routines creates an event and queues it onto a 22 * work-queue, where a worker thread can drive recovery. 23 */ 24 25 static DEFINE_SPINLOCK(eeh_eventlist_lock); 26 static DECLARE_COMPLETION(eeh_eventlist_event); 27 static LIST_HEAD(eeh_eventlist); 28 29 /** 30 * eeh_event_handler - Dispatch EEH events. 31 * @dummy - unused 32 * 33 * The detection of a frozen slot can occur inside an interrupt, 34 * where it can be hard to do anything about it. The goal of this 35 * routine is to pull these detection events out of the context 36 * of the interrupt handler, and re-dispatch them for processing 37 * at a later time in a normal context. 38 */ 39 static int eeh_event_handler(void * dummy) 40 { 41 unsigned long flags; 42 struct eeh_event *event; 43 struct eeh_pe *pe; 44 45 while (!kthread_should_stop()) { 46 if (wait_for_completion_interruptible(&eeh_eventlist_event)) 47 break; 48 49 /* Fetch EEH event from the queue */ 50 spin_lock_irqsave(&eeh_eventlist_lock, flags); 51 event = NULL; 52 if (!list_empty(&eeh_eventlist)) { 53 event = list_entry(eeh_eventlist.next, 54 struct eeh_event, list); 55 list_del(&event->list); 56 } 57 spin_unlock_irqrestore(&eeh_eventlist_lock, flags); 58 if (!event) 59 continue; 60 61 /* We might have event without binding PE */ 62 pe = event->pe; 63 if (pe) { 64 if (pe->type & EEH_PE_PHB) 65 pr_info("EEH: Detected error on PHB#%x\n", 66 pe->phb->global_number); 67 else 68 pr_info("EEH: Detected PCI bus error on " 69 "PHB#%x-PE#%x\n", 70 pe->phb->global_number, pe->addr); 71 eeh_handle_normal_event(pe); 72 } else { 73 eeh_handle_special_event(); 74 } 75 76 kfree(event); 77 } 78 79 return 0; 80 } 81 82 /** 83 * eeh_event_init - Start kernel thread to handle EEH events 84 * 85 * This routine is called to start the kernel thread for processing 86 * EEH event. 87 */ 88 int eeh_event_init(void) 89 { 90 struct task_struct *t; 91 int ret = 0; 92 93 t = kthread_run(eeh_event_handler, NULL, "eehd"); 94 if (IS_ERR(t)) { 95 ret = PTR_ERR(t); 96 pr_err("%s: Failed to start EEH daemon (%d)\n", 97 __func__, ret); 98 return ret; 99 } 100 101 return 0; 102 } 103 104 /** 105 * eeh_send_failure_event - Generate a PCI error event 106 * @pe: EEH PE 107 * 108 * This routine can be called within an interrupt context; 109 * the actual event will be delivered in a normal context 110 * (from a workqueue). 111 */ 112 int __eeh_send_failure_event(struct eeh_pe *pe) 113 { 114 unsigned long flags; 115 struct eeh_event *event; 116 117 event = kzalloc(sizeof(*event), GFP_ATOMIC); 118 if (!event) { 119 pr_err("EEH: out of memory, event not handled\n"); 120 return -ENOMEM; 121 } 122 event->pe = pe; 123 124 /* We may or may not be called in an interrupt context */ 125 spin_lock_irqsave(&eeh_eventlist_lock, flags); 126 list_add(&event->list, &eeh_eventlist); 127 spin_unlock_irqrestore(&eeh_eventlist_lock, flags); 128 129 /* For EEH deamon to knick in */ 130 complete(&eeh_eventlist_event); 131 132 return 0; 133 } 134 135 int eeh_send_failure_event(struct eeh_pe *pe) 136 { 137 /* 138 * If we've manually supressed recovery events via debugfs 139 * then just drop it on the floor. 140 */ 141 if (eeh_debugfs_no_recover) { 142 pr_err("EEH: Event dropped due to no_recover setting\n"); 143 return 0; 144 } 145 146 return __eeh_send_failure_event(pe); 147 } 148 149 /** 150 * eeh_remove_event - Remove EEH event from the queue 151 * @pe: Event binding to the PE 152 * @force: Event will be removed unconditionally 153 * 154 * On PowerNV platform, we might have subsequent coming events 155 * is part of the former one. For that case, those subsequent 156 * coming events are totally duplicated and unnecessary, thus 157 * they should be removed. 158 */ 159 void eeh_remove_event(struct eeh_pe *pe, bool force) 160 { 161 unsigned long flags; 162 struct eeh_event *event, *tmp; 163 164 /* 165 * If we have NULL PE passed in, we have dead IOC 166 * or we're sure we can report all existing errors 167 * by the caller. 168 * 169 * With "force", the event with associated PE that 170 * have been isolated, the event won't be removed 171 * to avoid event lost. 172 */ 173 spin_lock_irqsave(&eeh_eventlist_lock, flags); 174 list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { 175 if (!force && event->pe && 176 (event->pe->state & EEH_PE_ISOLATED)) 177 continue; 178 179 if (!pe) { 180 list_del(&event->list); 181 kfree(event); 182 } else if (pe->type & EEH_PE_PHB) { 183 if (event->pe && event->pe->phb == pe->phb) { 184 list_del(&event->list); 185 kfree(event); 186 } 187 } else if (event->pe == pe) { 188 list_del(&event->list); 189 kfree(event); 190 } 191 } 192 spin_unlock_irqrestore(&eeh_eventlist_lock, flags); 193 } 194