1 /* 2 * OPAL hypervisor Maintenance interrupt handling support in PowreNV. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; If not, see <http://www.gnu.org/licenses/>. 16 * 17 * Copyright 2014 IBM Corporation 18 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> 19 */ 20 21 #undef DEBUG 22 23 #include <linux/kernel.h> 24 #include <linux/init.h> 25 #include <linux/of.h> 26 #include <linux/mm.h> 27 #include <linux/slab.h> 28 29 #include <asm/opal.h> 30 #include <asm/cputable.h> 31 #include <asm/machdep.h> 32 33 static int opal_hmi_handler_nb_init; 34 struct OpalHmiEvtNode { 35 struct list_head list; 36 struct OpalHMIEvent hmi_evt; 37 }; 38 39 struct xstop_reason { 40 uint32_t xstop_reason; 41 const char *unit_failed; 42 const char *description; 43 }; 44 45 static LIST_HEAD(opal_hmi_evt_list); 46 static DEFINE_SPINLOCK(opal_hmi_evt_lock); 47 48 static void print_core_checkstop_reason(const char *level, 49 struct OpalHMIEvent *hmi_evt) 50 { 51 int i; 52 static const struct xstop_reason xstop_reason[] = { 53 { CORE_CHECKSTOP_IFU_REGFILE, "IFU", 54 "RegFile core check stop" }, 55 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" }, 56 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC", 57 "Core checkstop during recovery" }, 58 { CORE_CHECKSTOP_ISU_REGFILE, "ISU", 59 "RegFile core check stop (mapper error)" }, 60 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" }, 61 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" }, 62 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" }, 63 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC", 64 "Recovery in maintenance mode" }, 65 { CORE_CHECKSTOP_LSU_REGFILE, "LSU", 66 "RegFile core check stop" }, 67 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC", 68 "Forward Progress Error" }, 69 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" }, 70 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" }, 71 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC", 72 "Hypervisor Resource error - core check stop" }, 73 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC", 74 "Hang Recovery Failed (core check stop)" }, 75 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC", 76 "Ambiguous Hang Detected (unknown source)" }, 77 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC", 78 "Debug Trigger Error inject" }, 79 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC", 80 "Hypervisor check stop via SPRC/SPRD" }, 81 }; 82 83 /* Validity check */ 84 if (!hmi_evt->u.xstop_error.xstop_reason) { 85 printk("%s Unknown Core check stop.\n", level); 86 return; 87 } 88 89 printk("%s CPU PIR: %08x\n", level, 90 be32_to_cpu(hmi_evt->u.xstop_error.u.pir)); 91 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) 92 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & 93 xstop_reason[i].xstop_reason) 94 printk("%s [Unit: %-3s] %s\n", level, 95 xstop_reason[i].unit_failed, 96 xstop_reason[i].description); 97 } 98 99 static void print_nx_checkstop_reason(const char *level, 100 struct OpalHMIEvent *hmi_evt) 101 { 102 int i; 103 static const struct xstop_reason xstop_reason[] = { 104 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine", 105 "SHM invalid state error" }, 106 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine", 107 "DMA invalid state error bit 15" }, 108 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine", 109 "DMA invalid state error bit 16" }, 110 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine", 111 "Channel 0 invalid state error" }, 112 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine", 113 "Channel 1 invalid state error" }, 114 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine", 115 "Channel 2 invalid state error" }, 116 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine", 117 "Channel 3 invalid state error" }, 118 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine", 119 "Channel 4 invalid state error" }, 120 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine", 121 "Channel 5 invalid state error" }, 122 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine", 123 "Channel 6 invalid state error" }, 124 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine", 125 "Channel 7 invalid state error" }, 126 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine", 127 "UE error on CRB(CSB address, CCB)" }, 128 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine", 129 "SUE error on CRB(CSB address, CCB)" }, 130 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface", 131 "CRB Kill ISN received while holding ISN with UE error" }, 132 }; 133 134 /* Validity check */ 135 if (!hmi_evt->u.xstop_error.xstop_reason) { 136 printk("%s Unknown NX check stop.\n", level); 137 return; 138 } 139 140 printk("%s NX checkstop on CHIP ID: %x\n", level, 141 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id)); 142 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++) 143 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) & 144 xstop_reason[i].xstop_reason) 145 printk("%s [Unit: %-3s] %s\n", level, 146 xstop_reason[i].unit_failed, 147 xstop_reason[i].description); 148 } 149 150 static void print_checkstop_reason(const char *level, 151 struct OpalHMIEvent *hmi_evt) 152 { 153 uint8_t type = hmi_evt->u.xstop_error.xstop_type; 154 switch (type) { 155 case CHECKSTOP_TYPE_CORE: 156 print_core_checkstop_reason(level, hmi_evt); 157 break; 158 case CHECKSTOP_TYPE_NX: 159 print_nx_checkstop_reason(level, hmi_evt); 160 break; 161 default: 162 printk("%s Unknown Malfunction Alert of type %d\n", 163 level, type); 164 break; 165 } 166 } 167 168 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) 169 { 170 const char *level, *sevstr, *error_info; 171 static const char *hmi_error_types[] = { 172 "Malfunction Alert", 173 "Processor Recovery done", 174 "Processor recovery occurred again", 175 "Processor recovery occurred for masked error", 176 "Timer facility experienced an error", 177 "TFMR SPR is corrupted", 178 "UPS (Uniterrupted Power System) Overflow indication", 179 "An XSCOM operation failure", 180 "An XSCOM operation completed", 181 "SCOM has set a reserved FIR bit to cause recovery", 182 "Debug trigger has set a reserved FIR bit to cause recovery", 183 "A hypervisor resource error occurred", 184 "CAPP recovery process is in progress", 185 }; 186 187 /* Print things out */ 188 if (hmi_evt->version < OpalHMIEvt_V1) { 189 pr_err("HMI Interrupt, Unknown event version %d !\n", 190 hmi_evt->version); 191 return; 192 } 193 switch (hmi_evt->severity) { 194 case OpalHMI_SEV_NO_ERROR: 195 level = KERN_INFO; 196 sevstr = "Harmless"; 197 break; 198 case OpalHMI_SEV_WARNING: 199 level = KERN_WARNING; 200 sevstr = ""; 201 break; 202 case OpalHMI_SEV_ERROR_SYNC: 203 level = KERN_ERR; 204 sevstr = "Severe"; 205 break; 206 case OpalHMI_SEV_FATAL: 207 default: 208 level = KERN_ERR; 209 sevstr = "Fatal"; 210 break; 211 } 212 213 printk("%s%s Hypervisor Maintenance interrupt [%s]\n", 214 level, sevstr, 215 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? 216 "Recovered" : "Not recovered"); 217 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? 218 hmi_error_types[hmi_evt->type] 219 : "Unknown"; 220 printk("%s Error detail: %s\n", level, error_info); 221 printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); 222 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || 223 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) 224 printk("%s TFMR: %016llx\n", level, 225 be64_to_cpu(hmi_evt->tfmr)); 226 227 if (hmi_evt->version < OpalHMIEvt_V2) 228 return; 229 230 /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */ 231 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT) 232 print_checkstop_reason(level, hmi_evt); 233 } 234 235 static void hmi_event_handler(struct work_struct *work) 236 { 237 unsigned long flags; 238 struct OpalHMIEvent *hmi_evt; 239 struct OpalHmiEvtNode *msg_node; 240 uint8_t disposition; 241 struct opal_msg msg; 242 int unrecoverable = 0; 243 244 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 245 while (!list_empty(&opal_hmi_evt_list)) { 246 msg_node = list_entry(opal_hmi_evt_list.next, 247 struct OpalHmiEvtNode, list); 248 list_del(&msg_node->list); 249 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 250 251 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt; 252 print_hmi_event_info(hmi_evt); 253 disposition = hmi_evt->disposition; 254 kfree(msg_node); 255 256 /* 257 * Check if HMI event has been recovered or not. If not 258 * then kernel can't continue, we need to panic. 259 * But before we do that, display all the HMI event 260 * available on the list and set unrecoverable flag to 1. 261 */ 262 if (disposition != OpalHMI_DISPOSITION_RECOVERED) 263 unrecoverable = 1; 264 265 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 266 } 267 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 268 269 if (unrecoverable) { 270 int ret; 271 272 /* Pull all HMI events from OPAL before we panic. */ 273 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { 274 u32 type; 275 276 type = be32_to_cpu(msg.msg_type); 277 278 /* skip if not HMI event */ 279 if (type != OPAL_MSG_HMI_EVT) 280 continue; 281 282 /* HMI event info starts from param[0] */ 283 hmi_evt = (struct OpalHMIEvent *)&msg.params[0]; 284 print_hmi_event_info(hmi_evt); 285 } 286 287 /* 288 * Unrecoverable HMI exception. We need to inform BMC/OCC 289 * about this error so that it can collect relevant data 290 * for error analysis before rebooting. 291 */ 292 ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, 293 "Unrecoverable HMI exception"); 294 if (ret == OPAL_UNSUPPORTED) { 295 pr_emerg("Reboot type %d not supported\n", 296 OPAL_REBOOT_PLATFORM_ERROR); 297 } 298 299 /* 300 * Fall through and panic if opal_cec_reboot2() returns 301 * OPAL_UNSUPPORTED. 302 */ 303 panic("Unrecoverable HMI exception"); 304 } 305 } 306 307 static DECLARE_WORK(hmi_event_work, hmi_event_handler); 308 /* 309 * opal_handle_hmi_event - notifier handler that queues up HMI events 310 * to be preocessed later. 311 */ 312 static int opal_handle_hmi_event(struct notifier_block *nb, 313 unsigned long msg_type, void *msg) 314 { 315 unsigned long flags; 316 struct OpalHMIEvent *hmi_evt; 317 struct opal_msg *hmi_msg = msg; 318 struct OpalHmiEvtNode *msg_node; 319 320 /* Sanity Checks */ 321 if (msg_type != OPAL_MSG_HMI_EVT) 322 return 0; 323 324 /* HMI event info starts from param[0] */ 325 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0]; 326 327 /* Delay the logging of HMI events to workqueue. */ 328 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); 329 if (!msg_node) { 330 pr_err("HMI: out of memory, Opal message event not handled\n"); 331 return -ENOMEM; 332 } 333 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent)); 334 335 spin_lock_irqsave(&opal_hmi_evt_lock, flags); 336 list_add(&msg_node->list, &opal_hmi_evt_list); 337 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); 338 339 schedule_work(&hmi_event_work); 340 return 0; 341 } 342 343 static struct notifier_block opal_hmi_handler_nb = { 344 .notifier_call = opal_handle_hmi_event, 345 .next = NULL, 346 .priority = 0, 347 }; 348 349 int __init opal_hmi_handler_init(void) 350 { 351 int ret; 352 353 if (!opal_hmi_handler_nb_init) { 354 ret = opal_message_notifier_register( 355 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb); 356 if (ret) { 357 pr_err("%s: Can't register OPAL event notifier (%d)\n", 358 __func__, ret); 359 return ret; 360 } 361 opal_hmi_handler_nb_init = 1; 362 } 363 return 0; 364 } 365