1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * OPAL hypervisor Maintenance interrupt handling support in PowerNV. 4 * 5 * Copyright 2014 IBM Corporation 6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> 7 */ 8 9 #undef DEBUG 10 11 #include <linux/kernel.h> 12 #include <linux/init.h> 13 #include <linux/of.h> 14 #include <linux/mm.h> 15 #include <linux/slab.h> 16 17 #include <asm/opal.h> 18 #include <asm/cputable.h> 19 #include <asm/machdep.h> 20 21 #include "powernv.h" 22 23 static int opal_hmi_handler_nb_init; 24 struct OpalHmiEvtNode { 25 struct list_head list; 26 struct OpalHMIEvent hmi_evt; 27 }; 28 29 struct xstop_reason { 30 uint32_t xstop_reason; 31 const char *unit_failed; 32 const char *description; 33 }; 34 35 static LIST_HEAD(opal_hmi_evt_list); 36 static DEFINE_SPINLOCK(opal_hmi_evt_lock); 37 38 static void print_core_checkstop_reason(const char *level, 39 struct OpalHMIEvent *hmi_evt) 40 { 41 int i; 42 static const struct xstop_reason xstop_reason[] = { 43 { CORE_CHECKSTOP_IFU_REGFILE, "IFU", 44 "RegFile core check stop" }, 45 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" }, 46 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC", 47 "Core checkstop during recovery" }, 48 { CORE_CHECKSTOP_ISU_REGFILE, "ISU", 49 "RegFile core check stop (mapper error)" }, 50 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" }, 51 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" }, 52 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" }, 53 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC", 54 "Recovery in maintenance mode" }, 55 { CORE_CHECKSTOP_LSU_REGFILE, "LSU", 56 "RegFile core check stop" }, 57 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC", 58 "Forward Progress Error" }, 59 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" }, 60 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" }, 61 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC", 62 "Hypervisor Resource error - core check stop" }, 63 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC", 64 "Hang Recovery Failed (core check stop)" }, 65 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC", 66 "Ambiguous Hang Detected (unknown source)" }, 67 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC", 68 "Debug Trigger Error inject" }, 69 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC", 70 "Hypervisor check stop via SPRC/SPRD" }, 71 }; 72 73 /* Validity check */ 74 if (!hmi_evt->u.xstop_error.xstop_reason) { 75 printk("%s Unknown Core check stop.\n", level); 76 return; 77 } 78 79 printk("%s CPU PIR: %08x\n", level, 80 be32_to_cpu(hmi_evt->u.xstop_error.u.pir)); 81 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) 82 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & 83 xstop_reason[i].xstop_reason) 84 printk("%s [Unit: %-3s] %s\n", level, 85 xstop_reason[i].unit_failed, 86 xstop_reason[i].description); 87 } 88 89 static void print_nx_checkstop_reason(const char *level, 90 struct OpalHMIEvent *hmi_evt) 91 { 92 int i; 93 static const struct xstop_reason xstop_reason[] = { 94 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine", 95 "SHM invalid state error" }, 96 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine", 97 "DMA invalid state error bit 15" }, 98 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine", 99 "DMA invalid state error bit 16" }, 100 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine", 101 "Channel 0 invalid state error" }, 102 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine", 103 "Channel 1 invalid state error" }, 104 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine", 105 "Channel 2 invalid state error" }, 106 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine", 107 "Channel 3 invalid state error" }, 108 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine", 109 "Channel 4 invalid state error" }, 110 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine", 111 "Channel 5 invalid state error" }, 112 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine", 113 "Channel 6 invalid state error" }, 114 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine", 115 "Channel 7 invalid state error" }, 116 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine", 117 "UE error on CRB(CSB address, CCB)" }, 118 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine", 119 "SUE error on CRB(CSB address, CCB)" }, 120 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface", 121 "CRB Kill ISN received while holding ISN with UE error" }, 122 }; 123 124 /* Validity check */ 125 if (!hmi_evt->u.xstop_error.xstop_reason) { 126 printk("%s Unknown NX check stop.\n", level); 127 return; 128 } 129 130 printk("%s NX checkstop on CHIP ID: %x\n", level, 131 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id)); 132 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) 133 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & 134 xstop_reason[i].xstop_reason) 135 printk("%s [Unit: %-3s] %s\n", level, 136 xstop_reason[i].unit_failed, 137 xstop_reason[i].description); 138 } 139 140 static void print_checkstop_reason(const char *level, 141 struct OpalHMIEvent *hmi_evt) 142 { 143 uint8_t type = hmi_evt->u.xstop_error.xstop_type; 144 switch (type) { 145 case CHECKSTOP_TYPE_CORE: 146 print_core_checkstop_reason(level, hmi_evt); 147 break; 148 case CHECKSTOP_TYPE_NX: 149 print_nx_checkstop_reason(level, hmi_evt); 150 break; 151 default: 152 printk("%s Unknown Malfunction Alert of type %d\n", 153 level, type); 154 break; 155 } 156 } 157 158 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) 159 { 160 const char *level, *sevstr, *error_info; 161 static const char *hmi_error_types[] = { 162 "Malfunction Alert", 163 "Processor Recovery done", 164 "Processor recovery occurred again", 165 "Processor recovery occurred for masked error", 166 "Timer facility experienced an error", 167 "TFMR SPR is corrupted", 168 "UPS (Uninterrupted Power System) Overflow indication", 169 "An XSCOM operation failure", 170 "An XSCOM operation completed", 171 "SCOM has set a reserved FIR bit to cause recovery", 172 "Debug trigger has set a reserved FIR bit to cause recovery", 173 "A hypervisor resource error occurred", 174 "CAPP recovery process is in progress", 175 }; 176 177 /* Print things out */ 178 if (hmi_evt->version < OpalHMIEvt_V1) { 179 pr_err("HMI Interrupt, Unknown event version %d !\n", 180 hmi_evt->version); 181 return; 182 } 183 switch (hmi_evt->severity) { 184 case OpalHMI_SEV_NO_ERROR: 185 level = KERN_INFO; 186 sevstr = "Harmless"; 187 break; 188 case OpalHMI_SEV_WARNING: 189 level = KERN_WARNING; 190 sevstr = ""; 191 break; 192 case OpalHMI_SEV_ERROR_SYNC: 193 level = KERN_ERR; 194 sevstr = "Severe"; 195 break; 196 case OpalHMI_SEV_FATAL: 197 default: 198 level = KERN_ERR; 199 sevstr = "Fatal"; 200 break; 201 } 202 203 printk("%s%s Hypervisor Maintenance interrupt [%s]\n", 204 level, sevstr, 205 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? 206 "Recovered" : "Not recovered"); 207 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? 208 hmi_error_types[hmi_evt->type] 209 : "Unknown"; 210 printk("%s Error detail: %s\n", level, error_info); 211 printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); 212 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || 213 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) 214 printk("%s TFMR: %016llx\n", level, 215 be64_to_cpu(hmi_evt->tfmr)); 216 217 if (hmi_evt->version < OpalHMIEvt_V2) 218 return; 219 220 /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */ 221 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT) 222 print_checkstop_reason(level, hmi_evt); 223 } 224 225 static void hmi_event_handler(struct work_struct *work) 226 { 227 unsigned long flags; 228 struct OpalHMIEvent *hmi_evt; 229 struct OpalHmiEvtNode *msg_node; 230 uint8_t disposition; 231 struct opal_msg msg; 232 int unrecoverable = 0; 233 234 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 235 while (!list_empty(&opal_hmi_evt_list)) { 236 msg_node = list_entry(opal_hmi_evt_list.next, 237 struct OpalHmiEvtNode, list); 238 list_del(&msg_node->list); 239 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 240 241 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt; 242 print_hmi_event_info(hmi_evt); 243 disposition = hmi_evt->disposition; 244 kfree(msg_node); 245 246 /* 247 * Check if HMI event has been recovered or not. If not 248 * then kernel can't continue, we need to panic. 249 * But before we do that, display all the HMI event 250 * available on the list and set unrecoverable flag to 1. 251 */ 252 if (disposition != OpalHMI_DISPOSITION_RECOVERED) 253 unrecoverable = 1; 254 255 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 256 } 257 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 258 259 if (unrecoverable) { 260 /* Pull all HMI events from OPAL before we panic. */ 261 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { 262 u32 type; 263 264 type = be32_to_cpu(msg.msg_type); 265 266 /* skip if not HMI event */ 267 if (type != OPAL_MSG_HMI_EVT) 268 continue; 269 270 /* HMI event info starts from param[0] */ 271 hmi_evt = (struct OpalHMIEvent *)&msg.params[0]; 272 print_hmi_event_info(hmi_evt); 273 } 274 275 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception"); 276 } 277 } 278 279 static DECLARE_WORK(hmi_event_work, hmi_event_handler); 280 /* 281 * opal_handle_hmi_event - notifier handler that queues up HMI events 282 * to be preocessed later. 283 */ 284 static int opal_handle_hmi_event(struct notifier_block *nb, 285 unsigned long msg_type, void *msg) 286 { 287 unsigned long flags; 288 struct OpalHMIEvent *hmi_evt; 289 struct opal_msg *hmi_msg = msg; 290 struct OpalHmiEvtNode *msg_node; 291 292 /* Sanity Checks */ 293 if (msg_type != OPAL_MSG_HMI_EVT) 294 return 0; 295 296 /* HMI event info starts from param[0] */ 297 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0]; 298 299 /* Delay the logging of HMI events to workqueue. */ 300 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); 301 if (!msg_node) { 302 pr_err("HMI: out of memory, Opal message event not handled\n"); 303 return -ENOMEM; 304 } 305 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt)); 306 307 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 308 list_add(&msg_node->list, &opal_hmi_evt_list); 309 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 310 311 schedule_work(&hmi_event_work); 312 return 0; 313 } 314 315 static struct notifier_block opal_hmi_handler_nb = { 316 .notifier_call = opal_handle_hmi_event, 317 .next = NULL, 318 .priority = 0, 319 }; 320 321 int __init opal_hmi_handler_init(void) 322 { 323 int ret; 324 325 if (!opal_hmi_handler_nb_init) { 326 ret = opal_message_notifier_register( 327 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb); 328 if (ret) { 329 pr_err("%s: Can't register OPAL event notifier (%d)\n", 330 __func__, ret); 331 return ret; 332 } 333 opal_hmi_handler_nb_init = 1; 334 } 335 return 0; 336 } 337