1d9953105SMichael Ellerman /* 2d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 3d9953105SMichael Ellerman * 4d9953105SMichael Ellerman * This program is free software; you can redistribute it and/or modify 5d9953105SMichael Ellerman * it under the terms of the GNU General Public License as published by 6d9953105SMichael Ellerman * the Free Software Foundation; either version 2 of the License, or 7d9953105SMichael Ellerman * (at your option) any later version. 8d9953105SMichael Ellerman * 9d9953105SMichael Ellerman * This program is distributed in the hope that it will be useful, 10d9953105SMichael Ellerman * but WITHOUT ANY WARRANTY; without even the implied warranty of 11d9953105SMichael Ellerman * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12d9953105SMichael Ellerman * GNU General Public License for more details. 13d9953105SMichael Ellerman * 14d9953105SMichael Ellerman * You should have received a copy of the GNU General Public License 15d9953105SMichael Ellerman * along with this program; if not, write to the Free Software 16d9953105SMichael Ellerman * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17d9953105SMichael Ellerman */ 18d9953105SMichael Ellerman 19d9953105SMichael Ellerman /* Change Activity: 20d9953105SMichael Ellerman * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support. 21d9953105SMichael Ellerman * End Change Activity 22d9953105SMichael Ellerman */ 23d9953105SMichael Ellerman 24d9953105SMichael Ellerman #include <linux/errno.h> 25d9953105SMichael Ellerman #include <linux/threads.h> 26d9953105SMichael Ellerman #include <linux/kernel_stat.h> 27d9953105SMichael Ellerman #include <linux/signal.h> 28d9953105SMichael Ellerman #include <linux/sched.h> 29d9953105SMichael Ellerman #include <linux/ioport.h> 30d9953105SMichael Ellerman #include <linux/interrupt.h> 31d9953105SMichael Ellerman #include <linux/timex.h> 32d9953105SMichael Ellerman #include <linux/init.h> 33d9953105SMichael Ellerman #include <linux/delay.h> 34d9953105SMichael Ellerman #include <linux/irq.h> 35d9953105SMichael Ellerman #include <linux/random.h> 36d9953105SMichael Ellerman #include <linux/sysrq.h> 37d9953105SMichael Ellerman #include <linux/bitops.h> 38*55fc0c56SAnton Blanchard #include <linux/fs.h> 39*55fc0c56SAnton Blanchard #include <linux/reboot.h> 40d9953105SMichael Ellerman 41d9953105SMichael Ellerman #include <asm/uaccess.h> 42d9953105SMichael Ellerman #include <asm/system.h> 43d9953105SMichael Ellerman #include <asm/io.h> 44d9953105SMichael Ellerman #include <asm/pgtable.h> 45d9953105SMichael Ellerman #include <asm/irq.h> 46d9953105SMichael Ellerman #include <asm/cache.h> 47d9953105SMichael Ellerman #include <asm/prom.h> 48d9953105SMichael Ellerman #include <asm/ptrace.h> 49d9953105SMichael Ellerman #include <asm/machdep.h> 50d9953105SMichael Ellerman #include <asm/rtas.h> 51dcad47fcSDavid Gibson #include <asm/udbg.h> 528c4f1f29SMichael Ellerman #include <asm/firmware.h> 53d9953105SMichael Ellerman 54577830b0SMichael Ellerman #include "pseries.h" 55c902be71SArnd Bergmann 56d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 57d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 58d9953105SMichael Ellerman 59d368514cSAnton Blanchard static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; 60d368514cSAnton Blanchard static DEFINE_PER_CPU(__u64, mce_data_buf); 61d9953105SMichael Ellerman 62d9953105SMichael Ellerman static int ras_get_sensor_state_token; 63d9953105SMichael Ellerman static int ras_check_exception_token; 64d9953105SMichael Ellerman 65d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 66d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 67d9953105SMichael Ellerman 687d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 697d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 70d9953105SMichael Ellerman 710ebfff14SBenjamin Herrenschmidt 72d9953105SMichael Ellerman /* 73d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 74d9953105SMichael Ellerman * and power system events. 75d9953105SMichael Ellerman */ 76d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 77d9953105SMichael Ellerman { 78d9953105SMichael Ellerman struct device_node *np; 79d9953105SMichael Ellerman 80d9953105SMichael Ellerman ras_get_sensor_state_token = rtas_token("get-sensor-state"); 81d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 82d9953105SMichael Ellerman 83d9953105SMichael Ellerman /* Internal Errors */ 84d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 85d9953105SMichael Ellerman if (np != NULL) { 8632c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 8732c96f77SMark Nelson "RAS_ERROR"); 88d9953105SMichael Ellerman of_node_put(np); 89d9953105SMichael Ellerman } 90d9953105SMichael Ellerman 91d9953105SMichael Ellerman /* EPOW Events */ 92d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 93d9953105SMichael Ellerman if (np != NULL) { 9432c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 95d9953105SMichael Ellerman of_node_put(np); 96d9953105SMichael Ellerman } 97d9953105SMichael Ellerman 9869ed3324SAnton Blanchard return 0; 99d9953105SMichael Ellerman } 100*55fc0c56SAnton Blanchard subsys_initcall(init_ras_IRQ); 101d9953105SMichael Ellerman 102*55fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 103*55fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 104*55fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 105*55fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 106*55fc0c56SAnton Blanchard 107*55fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 108*55fc0c56SAnton Blanchard { 109*55fc0c56SAnton Blanchard switch (event_modifier) { 110*55fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 111*55fc0c56SAnton Blanchard pr_emerg("Firmware initiated power off"); 112*55fc0c56SAnton Blanchard orderly_poweroff(1); 113*55fc0c56SAnton Blanchard break; 114*55fc0c56SAnton Blanchard 115*55fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 116*55fc0c56SAnton Blanchard pr_emerg("Loss of power reported by firmware, system is " 117*55fc0c56SAnton Blanchard "running on UPS/battery"); 118*55fc0c56SAnton Blanchard break; 119*55fc0c56SAnton Blanchard 120*55fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 121*55fc0c56SAnton Blanchard pr_emerg("Loss of system critical functions reported by " 122*55fc0c56SAnton Blanchard "firmware"); 123*55fc0c56SAnton Blanchard pr_emerg("Check RTAS error log for details"); 124*55fc0c56SAnton Blanchard orderly_poweroff(1); 125*55fc0c56SAnton Blanchard break; 126*55fc0c56SAnton Blanchard 127*55fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 128*55fc0c56SAnton Blanchard pr_emerg("Ambient temperature too high reported by firmware"); 129*55fc0c56SAnton Blanchard pr_emerg("Check RTAS error log for details"); 130*55fc0c56SAnton Blanchard orderly_poweroff(1); 131*55fc0c56SAnton Blanchard break; 132*55fc0c56SAnton Blanchard 133*55fc0c56SAnton Blanchard default: 134*55fc0c56SAnton Blanchard pr_err("Unknown power/cooling shutdown event (modifier %d)", 135*55fc0c56SAnton Blanchard event_modifier); 136*55fc0c56SAnton Blanchard } 137*55fc0c56SAnton Blanchard } 138*55fc0c56SAnton Blanchard 139*55fc0c56SAnton Blanchard struct epow_errorlog { 140*55fc0c56SAnton Blanchard unsigned char sensor_value; 141*55fc0c56SAnton Blanchard unsigned char event_modifier; 142*55fc0c56SAnton Blanchard unsigned char extended_modifier; 143*55fc0c56SAnton Blanchard unsigned char reserved; 144*55fc0c56SAnton Blanchard unsigned char platform_reason; 145*55fc0c56SAnton Blanchard }; 146*55fc0c56SAnton Blanchard 147*55fc0c56SAnton Blanchard #define EPOW_RESET 0 148*55fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 149*55fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 150*55fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 151*55fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 152*55fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 153*55fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 154*55fc0c56SAnton Blanchard 155*55fc0c56SAnton Blanchard void rtas_parse_epow_errlog(struct rtas_error_log *log) 156*55fc0c56SAnton Blanchard { 157*55fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 158*55fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 159*55fc0c56SAnton Blanchard char action_code; 160*55fc0c56SAnton Blanchard char modifier; 161*55fc0c56SAnton Blanchard 162*55fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 163*55fc0c56SAnton Blanchard if (pseries_log == NULL) 164*55fc0c56SAnton Blanchard return; 165*55fc0c56SAnton Blanchard 166*55fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 167*55fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 168*55fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 169*55fc0c56SAnton Blanchard 170*55fc0c56SAnton Blanchard switch (action_code) { 171*55fc0c56SAnton Blanchard case EPOW_RESET: 172*55fc0c56SAnton Blanchard pr_err("Non critical power or cooling issue cleared"); 173*55fc0c56SAnton Blanchard break; 174*55fc0c56SAnton Blanchard 175*55fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 176*55fc0c56SAnton Blanchard pr_err("Non critical cooling issue reported by firmware"); 177*55fc0c56SAnton Blanchard pr_err("Check RTAS error log for details"); 178*55fc0c56SAnton Blanchard break; 179*55fc0c56SAnton Blanchard 180*55fc0c56SAnton Blanchard case EPOW_WARN_POWER: 181*55fc0c56SAnton Blanchard pr_err("Non critical power issue reported by firmware"); 182*55fc0c56SAnton Blanchard pr_err("Check RTAS error log for details"); 183*55fc0c56SAnton Blanchard break; 184*55fc0c56SAnton Blanchard 185*55fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 186*55fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 187*55fc0c56SAnton Blanchard break; 188*55fc0c56SAnton Blanchard 189*55fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 190*55fc0c56SAnton Blanchard pr_emerg("Firmware initiated power off"); 191*55fc0c56SAnton Blanchard orderly_poweroff(1); 192*55fc0c56SAnton Blanchard break; 193*55fc0c56SAnton Blanchard 194*55fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 195*55fc0c56SAnton Blanchard case EPOW_POWER_OFF: 196*55fc0c56SAnton Blanchard pr_emerg("Critical power/cooling issue reported by firmware"); 197*55fc0c56SAnton Blanchard pr_emerg("Check RTAS error log for details"); 198*55fc0c56SAnton Blanchard pr_emerg("Immediate power off"); 199*55fc0c56SAnton Blanchard emergency_sync(); 200*55fc0c56SAnton Blanchard kernel_power_off(); 201*55fc0c56SAnton Blanchard break; 202*55fc0c56SAnton Blanchard 203*55fc0c56SAnton Blanchard default: 204*55fc0c56SAnton Blanchard pr_err("Unknown power/cooling event (action code %d)", 205*55fc0c56SAnton Blanchard action_code); 206*55fc0c56SAnton Blanchard } 207*55fc0c56SAnton Blanchard } 208*55fc0c56SAnton Blanchard 209*55fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 2107d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 211d9953105SMichael Ellerman { 212*55fc0c56SAnton Blanchard int status; 213*55fc0c56SAnton Blanchard int state; 214d9953105SMichael Ellerman int critical; 215d9953105SMichael Ellerman 216d9953105SMichael Ellerman status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, 217d9953105SMichael Ellerman EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); 218d9953105SMichael Ellerman 219d9953105SMichael Ellerman if (state > 3) 220d9953105SMichael Ellerman critical = 1; /* Time Critical */ 221d9953105SMichael Ellerman else 222d9953105SMichael Ellerman critical = 0; 223d9953105SMichael Ellerman 224d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 225d9953105SMichael Ellerman 226d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 227b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 228476eb491SGrant Likely virq_to_hw(irq), 229d9953105SMichael Ellerman RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS, 230d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 231d9953105SMichael Ellerman rtas_get_error_log_max()); 232d9953105SMichael Ellerman 233d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 234d9953105SMichael Ellerman 235*55fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 236*55fc0c56SAnton Blanchard 237d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 238d9953105SMichael Ellerman return IRQ_HANDLED; 239d9953105SMichael Ellerman } 240d9953105SMichael Ellerman 241d9953105SMichael Ellerman /* 242d9953105SMichael Ellerman * Handle hardware error interrupts. 243d9953105SMichael Ellerman * 244d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 245d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 246d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 247d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 248d9953105SMichael Ellerman */ 2497d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 250d9953105SMichael Ellerman { 251d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 252d9953105SMichael Ellerman int status = 0xdeadbeef; 253d9953105SMichael Ellerman int fatal; 254d9953105SMichael Ellerman 255d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 256d9953105SMichael Ellerman 257d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 258b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 259476eb491SGrant Likely virq_to_hw(irq), 260d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /*Time Critical */, 261d9953105SMichael Ellerman __pa(&ras_log_buf), 262d9953105SMichael Ellerman rtas_get_error_log_max()); 263d9953105SMichael Ellerman 264d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 265d9953105SMichael Ellerman 266d9953105SMichael Ellerman if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC)) 267d9953105SMichael Ellerman fatal = 1; 268d9953105SMichael Ellerman else 269d9953105SMichael Ellerman fatal = 0; 270d9953105SMichael Ellerman 271d9953105SMichael Ellerman /* format and print the extended information */ 272d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 273d9953105SMichael Ellerman 274d9953105SMichael Ellerman if (fatal) { 275d9953105SMichael Ellerman udbg_printf("Fatal HW Error <0x%lx 0x%x>\n", 276d9953105SMichael Ellerman *((unsigned long *)&ras_log_buf), status); 277d9953105SMichael Ellerman printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n", 278d9953105SMichael Ellerman *((unsigned long *)&ras_log_buf), status); 279d9953105SMichael Ellerman 28036f8a2c4SMichael Ellerman #ifndef DEBUG_RTAS_POWER_OFF 281d9953105SMichael Ellerman /* Don't actually power off when debugging so we can test 282d9953105SMichael Ellerman * without actually failing while injecting errors. 283d9953105SMichael Ellerman * Error data will not be logged to syslog. 284d9953105SMichael Ellerman */ 285d9953105SMichael Ellerman ppc_md.power_off(); 286d9953105SMichael Ellerman #endif 287d9953105SMichael Ellerman } else { 288d9953105SMichael Ellerman udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n", 289d9953105SMichael Ellerman *((unsigned long *)&ras_log_buf), status); 290d9953105SMichael Ellerman printk(KERN_WARNING 291d9953105SMichael Ellerman "Warning: Recoverable hardware error <0x%lx 0x%x>\n", 292d9953105SMichael Ellerman *((unsigned long *)&ras_log_buf), status); 293d9953105SMichael Ellerman } 294d9953105SMichael Ellerman 295d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 296d9953105SMichael Ellerman return IRQ_HANDLED; 297d9953105SMichael Ellerman } 298d9953105SMichael Ellerman 299d368514cSAnton Blanchard /* 300d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 301d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 302d368514cSAnton Blanchard */ 303d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 304d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 305d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 306d368514cSAnton Blanchard 307d368514cSAnton Blanchard /* 308d368514cSAnton Blanchard * Get the error information for errors coming through the 309d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 310d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 311d9953105SMichael Ellerman * will be returned if found. 312d9953105SMichael Ellerman * 313d368514cSAnton Blanchard * If the RTAS error is not of the extended type, then we put it in a per 314d368514cSAnton Blanchard * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. 315d368514cSAnton Blanchard * 316d368514cSAnton Blanchard * The global_mce_data_buf does not have any locks or protection around it, 317d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 318d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 319d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 320d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 321d9953105SMichael Ellerman * second machine check did come in. 322d9953105SMichael Ellerman */ 323d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 324d9953105SMichael Ellerman { 325d9953105SMichael Ellerman unsigned long *savep; 326d368514cSAnton Blanchard struct rtas_error_log *h, *errhdr = NULL; 327d9953105SMichael Ellerman 328d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 329f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 330d368514cSAnton Blanchard return NULL; 331d9953105SMichael Ellerman } 332d368514cSAnton Blanchard 333d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 334d368514cSAnton Blanchard regs->gpr[3] = savep[0]; /* restore original r3 */ 335d368514cSAnton Blanchard 336d368514cSAnton Blanchard /* If it isn't an extended log we can use the per cpu 64bit buffer */ 337d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 338d368514cSAnton Blanchard if (!h->extended) { 339d368514cSAnton Blanchard memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64)); 340d368514cSAnton Blanchard errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf); 341d368514cSAnton Blanchard } else { 342d368514cSAnton Blanchard int len; 343d368514cSAnton Blanchard 344d368514cSAnton Blanchard len = max_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX); 345d368514cSAnton Blanchard memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 346d368514cSAnton Blanchard memcpy(global_mce_data_buf, h, len); 347d368514cSAnton Blanchard errhdr = (struct rtas_error_log *)global_mce_data_buf; 348d368514cSAnton Blanchard } 349d368514cSAnton Blanchard 350d9953105SMichael Ellerman return errhdr; 351d9953105SMichael Ellerman } 352d9953105SMichael Ellerman 353d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 354d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 355d9953105SMichael Ellerman * partition to receive FWNMI errors. 356d9953105SMichael Ellerman */ 357d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 358d9953105SMichael Ellerman { 359d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 360d9953105SMichael Ellerman if (ret != 0) 361d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 362d9953105SMichael Ellerman } 363d9953105SMichael Ellerman 364c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 365d9953105SMichael Ellerman { 366d9953105SMichael Ellerman if (fwnmi_active) { 367d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 368d9953105SMichael Ellerman if (errhdr) { 369d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 370d9953105SMichael Ellerman } 371d9953105SMichael Ellerman fwnmi_release_errinfo(); 372d9953105SMichael Ellerman } 373c902be71SArnd Bergmann return 0; /* need to perform reset */ 374d9953105SMichael Ellerman } 375d9953105SMichael Ellerman 376d9953105SMichael Ellerman /* 377d9953105SMichael Ellerman * See if we can recover from a machine check exception. 378d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 379d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 380d9953105SMichael Ellerman * which provides the error analysis for us. 381d9953105SMichael Ellerman * 382d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 383d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 384d9953105SMichael Ellerman */ 385d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 386d9953105SMichael Ellerman { 387d47d1d8aSAnton Blanchard int recovered = 0; 388d9953105SMichael Ellerman 389d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 390d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 391d47d1d8aSAnton Blanchard recovered = 0; 392d47d1d8aSAnton Blanchard 393d47d1d8aSAnton Blanchard } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { 394d9953105SMichael Ellerman /* Platform corrected itself */ 395d47d1d8aSAnton Blanchard recovered = 1; 396d47d1d8aSAnton Blanchard 397d47d1d8aSAnton Blanchard } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) { 398d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 399d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 400d47d1d8aSAnton Blanchard "be degraded\n"); 401d47d1d8aSAnton Blanchard recovered = 1; 402d47d1d8aSAnton Blanchard 403d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 404d47d1d8aSAnton Blanchard err->severity == RTAS_SEVERITY_ERROR_SYNC) { 405d47d1d8aSAnton Blanchard 406d47d1d8aSAnton Blanchard /* 407d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 408d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 409d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 410d47d1d8aSAnton Blanchard * fields being valid here. 411d47d1d8aSAnton Blanchard */ 412d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 413d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 414d47d1d8aSAnton Blanchard 415d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 416d47d1d8aSAnton Blanchard recovered = 1; 417d9953105SMichael Ellerman } 418d9953105SMichael Ellerman 4193f9793e6SAnton Blanchard log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 420d9953105SMichael Ellerman 421d47d1d8aSAnton Blanchard return recovered; 422d9953105SMichael Ellerman } 423d9953105SMichael Ellerman 424d9953105SMichael Ellerman /* 425d9953105SMichael Ellerman * Handle a machine check. 426d9953105SMichael Ellerman * 427d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 428d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 429d9953105SMichael Ellerman * error was recovered (never true if RI=0). 430d9953105SMichael Ellerman * 431d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 432d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 433d9953105SMichael Ellerman */ 434d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 435d9953105SMichael Ellerman { 436d9953105SMichael Ellerman struct rtas_error_log *errp; 437d9953105SMichael Ellerman 438d9953105SMichael Ellerman if (fwnmi_active) { 439d9953105SMichael Ellerman errp = fwnmi_get_errinfo(regs); 440d9953105SMichael Ellerman fwnmi_release_errinfo(); 441d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 442d9953105SMichael Ellerman return 1; 443d9953105SMichael Ellerman } 444d9953105SMichael Ellerman 445d9953105SMichael Ellerman return 0; 446d9953105SMichael Ellerman } 447