1d9953105SMichael Ellerman /* 2d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 3d9953105SMichael Ellerman * 4d9953105SMichael Ellerman * This program is free software; you can redistribute it and/or modify 5d9953105SMichael Ellerman * it under the terms of the GNU General Public License as published by 6d9953105SMichael Ellerman * the Free Software Foundation; either version 2 of the License, or 7d9953105SMichael Ellerman * (at your option) any later version. 8d9953105SMichael Ellerman * 9d9953105SMichael Ellerman * This program is distributed in the hope that it will be useful, 10d9953105SMichael Ellerman * but WITHOUT ANY WARRANTY; without even the implied warranty of 11d9953105SMichael Ellerman * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12d9953105SMichael Ellerman * GNU General Public License for more details. 13d9953105SMichael Ellerman * 14d9953105SMichael Ellerman * You should have received a copy of the GNU General Public License 15d9953105SMichael Ellerman * along with this program; if not, write to the Free Software 16d9953105SMichael Ellerman * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17d9953105SMichael Ellerman */ 18d9953105SMichael Ellerman 19d9953105SMichael Ellerman #include <linux/sched.h> 20d9953105SMichael Ellerman #include <linux/interrupt.h> 21d9953105SMichael Ellerman #include <linux/irq.h> 2290128997SAnton Blanchard #include <linux/of.h> 2355fc0c56SAnton Blanchard #include <linux/fs.h> 2455fc0c56SAnton Blanchard #include <linux/reboot.h> 25d9953105SMichael Ellerman 26d9953105SMichael Ellerman #include <asm/machdep.h> 27d9953105SMichael Ellerman #include <asm/rtas.h> 288c4f1f29SMichael Ellerman #include <asm/firmware.h> 29d9953105SMichael Ellerman 30577830b0SMichael Ellerman #include "pseries.h" 31c902be71SArnd Bergmann 32d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 33d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 34d9953105SMichael Ellerman 35d368514cSAnton Blanchard static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; 36d368514cSAnton Blanchard static DEFINE_PER_CPU(__u64, mce_data_buf); 37d9953105SMichael Ellerman 38d9953105SMichael Ellerman static int ras_check_exception_token; 39d9953105SMichael Ellerman 40d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 41d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 42d9953105SMichael Ellerman 43b4af279aSVipin K Parashar /* EPOW events counter variable */ 44b4af279aSVipin K Parashar static int num_epow_events; 45b4af279aSVipin K Parashar 46b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 477d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 487d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 49d9953105SMichael Ellerman 500ebfff14SBenjamin Herrenschmidt 51d9953105SMichael Ellerman /* 52*c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 53*c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 54*c9dccf1dSSam Bobroff * subsys stage. 55*c9dccf1dSSam Bobroff */ 56*c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 57*c9dccf1dSSam Bobroff { 58*c9dccf1dSSam Bobroff struct device_node *np; 59*c9dccf1dSSam Bobroff 60*c9dccf1dSSam Bobroff /* Hotplug Events */ 61*c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 62*c9dccf1dSSam Bobroff if (np != NULL) { 63*c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 64*c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 65*c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 66*c9dccf1dSSam Bobroff of_node_put(np); 67*c9dccf1dSSam Bobroff } 68*c9dccf1dSSam Bobroff 69*c9dccf1dSSam Bobroff return 0; 70*c9dccf1dSSam Bobroff } 71*c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 72*c9dccf1dSSam Bobroff 73*c9dccf1dSSam Bobroff /* 74d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 75d9953105SMichael Ellerman * and power system events. 76d9953105SMichael Ellerman */ 77d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 78d9953105SMichael Ellerman { 79d9953105SMichael Ellerman struct device_node *np; 80d9953105SMichael Ellerman 81d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 82d9953105SMichael Ellerman 83d9953105SMichael Ellerman /* Internal Errors */ 84d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 85d9953105SMichael Ellerman if (np != NULL) { 8632c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 8732c96f77SMark Nelson "RAS_ERROR"); 88d9953105SMichael Ellerman of_node_put(np); 89d9953105SMichael Ellerman } 90d9953105SMichael Ellerman 91d9953105SMichael Ellerman /* EPOW Events */ 92d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 93d9953105SMichael Ellerman if (np != NULL) { 9432c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 95d9953105SMichael Ellerman of_node_put(np); 96d9953105SMichael Ellerman } 97d9953105SMichael Ellerman 9869ed3324SAnton Blanchard return 0; 99d9953105SMichael Ellerman } 1008e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 101d9953105SMichael Ellerman 10255fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 10355fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 10455fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 10555fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 10655fc0c56SAnton Blanchard 10755fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 10855fc0c56SAnton Blanchard { 10955fc0c56SAnton Blanchard switch (event_modifier) { 11055fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 111b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 1121b7e0cbeSliguang orderly_poweroff(true); 11355fc0c56SAnton Blanchard break; 11455fc0c56SAnton Blanchard 11555fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 116b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 117b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 11879872e35SAnshuman Khandual orderly_poweroff(true); 11955fc0c56SAnton Blanchard break; 12055fc0c56SAnton Blanchard 12155fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 122b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 123b4af279aSVipin K Parashar " RTAS error log for details\n"); 1241b7e0cbeSliguang orderly_poweroff(true); 12555fc0c56SAnton Blanchard break; 12655fc0c56SAnton Blanchard 12755fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 128b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 129b4af279aSVipin K Parashar " error log for details\n"); 1301b7e0cbeSliguang orderly_poweroff(true); 13155fc0c56SAnton Blanchard break; 13255fc0c56SAnton Blanchard 13355fc0c56SAnton Blanchard default: 134b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 13555fc0c56SAnton Blanchard event_modifier); 13655fc0c56SAnton Blanchard } 13755fc0c56SAnton Blanchard } 13855fc0c56SAnton Blanchard 13955fc0c56SAnton Blanchard struct epow_errorlog { 14055fc0c56SAnton Blanchard unsigned char sensor_value; 14155fc0c56SAnton Blanchard unsigned char event_modifier; 14255fc0c56SAnton Blanchard unsigned char extended_modifier; 14355fc0c56SAnton Blanchard unsigned char reserved; 14455fc0c56SAnton Blanchard unsigned char platform_reason; 14555fc0c56SAnton Blanchard }; 14655fc0c56SAnton Blanchard 14755fc0c56SAnton Blanchard #define EPOW_RESET 0 14855fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 14955fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 15055fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 15155fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 15255fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 15355fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 15455fc0c56SAnton Blanchard 155e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 15655fc0c56SAnton Blanchard { 15755fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 15855fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 15955fc0c56SAnton Blanchard char action_code; 16055fc0c56SAnton Blanchard char modifier; 16155fc0c56SAnton Blanchard 16255fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 16355fc0c56SAnton Blanchard if (pseries_log == NULL) 16455fc0c56SAnton Blanchard return; 16555fc0c56SAnton Blanchard 16655fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 16755fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 16855fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 16955fc0c56SAnton Blanchard 17055fc0c56SAnton Blanchard switch (action_code) { 17155fc0c56SAnton Blanchard case EPOW_RESET: 172b4af279aSVipin K Parashar if (num_epow_events) { 173b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 174b4af279aSVipin K Parashar num_epow_events--; 175b4af279aSVipin K Parashar } 17655fc0c56SAnton Blanchard break; 17755fc0c56SAnton Blanchard 17855fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 179b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 180b4af279aSVipin K Parashar " log for details\n"); 18155fc0c56SAnton Blanchard break; 18255fc0c56SAnton Blanchard 18355fc0c56SAnton Blanchard case EPOW_WARN_POWER: 184b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 185b4af279aSVipin K Parashar " log for details\n"); 18655fc0c56SAnton Blanchard break; 18755fc0c56SAnton Blanchard 18855fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 18955fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 19055fc0c56SAnton Blanchard break; 19155fc0c56SAnton Blanchard 19255fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 193b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 194b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 1951b7e0cbeSliguang orderly_poweroff(true); 19655fc0c56SAnton Blanchard break; 19755fc0c56SAnton Blanchard 19855fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 19955fc0c56SAnton Blanchard case EPOW_POWER_OFF: 200b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 201b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 20255fc0c56SAnton Blanchard emergency_sync(); 20355fc0c56SAnton Blanchard kernel_power_off(); 20455fc0c56SAnton Blanchard break; 20555fc0c56SAnton Blanchard 20655fc0c56SAnton Blanchard default: 207b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 20855fc0c56SAnton Blanchard action_code); 20955fc0c56SAnton Blanchard } 210b4af279aSVipin K Parashar 211b4af279aSVipin K Parashar /* Increment epow events counter variable */ 212b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 213b4af279aSVipin K Parashar num_epow_events++; 21455fc0c56SAnton Blanchard } 21555fc0c56SAnton Blanchard 216b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 217b7d9eb39SJohn Allen { 218b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 219b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 220b7d9eb39SJohn Allen 221b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 222b7d9eb39SJohn Allen 223b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 224b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 225b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 226b7d9eb39SJohn Allen rtas_get_error_log_max()); 227b7d9eb39SJohn Allen 228b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 229b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 230b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 231b7d9eb39SJohn Allen 232b7d9eb39SJohn Allen /* 233b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 234b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 235b7d9eb39SJohn Allen */ 236b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 237b7d9eb39SJohn Allen hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) 238b7d9eb39SJohn Allen queue_hotplug_event(hp_elog, NULL, NULL); 239b7d9eb39SJohn Allen else 240b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 241b7d9eb39SJohn Allen 242b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 243b7d9eb39SJohn Allen return IRQ_HANDLED; 244b7d9eb39SJohn Allen } 245b7d9eb39SJohn Allen 24655fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 2477d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 248d9953105SMichael Ellerman { 24955fc0c56SAnton Blanchard int status; 25055fc0c56SAnton Blanchard int state; 251d9953105SMichael Ellerman int critical; 252d9953105SMichael Ellerman 2531c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 2541c2cb594SThomas Huth &state); 255d9953105SMichael Ellerman 256d9953105SMichael Ellerman if (state > 3) 257d9953105SMichael Ellerman critical = 1; /* Time Critical */ 258d9953105SMichael Ellerman else 259d9953105SMichael Ellerman critical = 0; 260d9953105SMichael Ellerman 261d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 262d9953105SMichael Ellerman 263d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 264b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 265476eb491SGrant Likely virq_to_hw(irq), 2666f43747fSAnton Blanchard RTAS_EPOW_WARNING, 267d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 268d9953105SMichael Ellerman rtas_get_error_log_max()); 269d9953105SMichael Ellerman 270d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 271d9953105SMichael Ellerman 27255fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 27355fc0c56SAnton Blanchard 274d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 275d9953105SMichael Ellerman return IRQ_HANDLED; 276d9953105SMichael Ellerman } 277d9953105SMichael Ellerman 278d9953105SMichael Ellerman /* 279d9953105SMichael Ellerman * Handle hardware error interrupts. 280d9953105SMichael Ellerman * 281d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 282d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 283d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 284d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 285d9953105SMichael Ellerman */ 2867d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 287d9953105SMichael Ellerman { 288d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 289cc8b5263SAnton Blanchard int status; 290d9953105SMichael Ellerman int fatal; 291d9953105SMichael Ellerman 292d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 293d9953105SMichael Ellerman 294d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 295b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 296476eb491SGrant Likely virq_to_hw(irq), 297d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 298d9953105SMichael Ellerman __pa(&ras_log_buf), 299d9953105SMichael Ellerman rtas_get_error_log_max()); 300d9953105SMichael Ellerman 301d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 302d9953105SMichael Ellerman 303a08a53eaSGreg Kurz if (status == 0 && 304a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 305d9953105SMichael Ellerman fatal = 1; 306d9953105SMichael Ellerman else 307d9953105SMichael Ellerman fatal = 0; 308d9953105SMichael Ellerman 309d9953105SMichael Ellerman /* format and print the extended information */ 310d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 311d9953105SMichael Ellerman 312d9953105SMichael Ellerman if (fatal) { 313b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 314b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 315cc8b5263SAnton Blanchard emergency_sync(); 316cc8b5263SAnton Blanchard kernel_power_off(); 317d9953105SMichael Ellerman } else { 318b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 319d9953105SMichael Ellerman } 320d9953105SMichael Ellerman 321d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 322d9953105SMichael Ellerman return IRQ_HANDLED; 323d9953105SMichael Ellerman } 324d9953105SMichael Ellerman 325d368514cSAnton Blanchard /* 326d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 327d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 328d368514cSAnton Blanchard */ 329d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 330d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 331d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 332d368514cSAnton Blanchard 333d368514cSAnton Blanchard /* 334d368514cSAnton Blanchard * Get the error information for errors coming through the 335d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 336d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 337d9953105SMichael Ellerman * will be returned if found. 338d9953105SMichael Ellerman * 339d368514cSAnton Blanchard * If the RTAS error is not of the extended type, then we put it in a per 340d368514cSAnton Blanchard * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. 341d368514cSAnton Blanchard * 342d368514cSAnton Blanchard * The global_mce_data_buf does not have any locks or protection around it, 343d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 344d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 345d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 346d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 347d9953105SMichael Ellerman * second machine check did come in. 348d9953105SMichael Ellerman */ 349d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 350d9953105SMichael Ellerman { 351d9953105SMichael Ellerman unsigned long *savep; 352d368514cSAnton Blanchard struct rtas_error_log *h, *errhdr = NULL; 353d9953105SMichael Ellerman 354ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 355ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 356ee1dd1e3SMahesh Salgaonkar 357d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 358f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 359d368514cSAnton Blanchard return NULL; 360d9953105SMichael Ellerman } 361d368514cSAnton Blanchard 362d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 363d368514cSAnton Blanchard regs->gpr[3] = savep[0]; /* restore original r3 */ 364d368514cSAnton Blanchard 365d368514cSAnton Blanchard /* If it isn't an extended log we can use the per cpu 64bit buffer */ 366d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 367a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 36869111bacSChristoph Lameter memcpy(this_cpu_ptr(&mce_data_buf), h, sizeof(__u64)); 36969111bacSChristoph Lameter errhdr = (struct rtas_error_log *)this_cpu_ptr(&mce_data_buf); 370d368514cSAnton Blanchard } else { 371a08a53eaSGreg Kurz int len, error_log_length; 372d368514cSAnton Blanchard 373a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 374a08a53eaSGreg Kurz len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 375d368514cSAnton Blanchard memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 376d368514cSAnton Blanchard memcpy(global_mce_data_buf, h, len); 377d368514cSAnton Blanchard errhdr = (struct rtas_error_log *)global_mce_data_buf; 378d368514cSAnton Blanchard } 379d368514cSAnton Blanchard 380d9953105SMichael Ellerman return errhdr; 381d9953105SMichael Ellerman } 382d9953105SMichael Ellerman 383d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 384d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 385d9953105SMichael Ellerman * partition to receive FWNMI errors. 386d9953105SMichael Ellerman */ 387d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 388d9953105SMichael Ellerman { 389d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 390d9953105SMichael Ellerman if (ret != 0) 391d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 392d9953105SMichael Ellerman } 393d9953105SMichael Ellerman 394c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 395d9953105SMichael Ellerman { 396bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 397bded0706SNicholas Piggin /* 398bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 399bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 400bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 401bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 402bded0706SNicholas Piggin */ 403bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 404bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 405bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 406bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 407bded0706SNicholas Piggin regs->msr = 0; 408bded0706SNicholas Piggin } 409bded0706SNicholas Piggin #endif 410bded0706SNicholas Piggin 411d9953105SMichael Ellerman if (fwnmi_active) { 412d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 413d9953105SMichael Ellerman if (errhdr) { 414d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 415d9953105SMichael Ellerman } 416d9953105SMichael Ellerman fwnmi_release_errinfo(); 417d9953105SMichael Ellerman } 418102c05e8SNicholas Piggin 419102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 420102c05e8SNicholas Piggin return 1; 421102c05e8SNicholas Piggin 422c902be71SArnd Bergmann return 0; /* need to perform reset */ 423d9953105SMichael Ellerman } 424d9953105SMichael Ellerman 425d9953105SMichael Ellerman /* 426d9953105SMichael Ellerman * See if we can recover from a machine check exception. 427d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 428d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 429d9953105SMichael Ellerman * which provides the error analysis for us. 430d9953105SMichael Ellerman * 431d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 432d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 433d9953105SMichael Ellerman */ 434d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 435d9953105SMichael Ellerman { 436d47d1d8aSAnton Blanchard int recovered = 0; 437a08a53eaSGreg Kurz int disposition = rtas_error_disposition(err); 438d9953105SMichael Ellerman 439d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 440d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 441d47d1d8aSAnton Blanchard recovered = 0; 442d47d1d8aSAnton Blanchard 443a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { 444d9953105SMichael Ellerman /* Platform corrected itself */ 445d47d1d8aSAnton Blanchard recovered = 1; 446d47d1d8aSAnton Blanchard 447a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 448d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 449d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 450d47d1d8aSAnton Blanchard "be degraded\n"); 451d47d1d8aSAnton Blanchard recovered = 1; 452d47d1d8aSAnton Blanchard 453d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 454a08a53eaSGreg Kurz rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { 455d47d1d8aSAnton Blanchard 456d47d1d8aSAnton Blanchard /* 457d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 458d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 459d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 460d47d1d8aSAnton Blanchard * fields being valid here. 461d47d1d8aSAnton Blanchard */ 462d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 463d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 464d47d1d8aSAnton Blanchard 465d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 466d47d1d8aSAnton Blanchard recovered = 1; 467d9953105SMichael Ellerman } 468d9953105SMichael Ellerman 4693f9793e6SAnton Blanchard log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 470d9953105SMichael Ellerman 471d47d1d8aSAnton Blanchard return recovered; 472d9953105SMichael Ellerman } 473d9953105SMichael Ellerman 474d9953105SMichael Ellerman /* 475d9953105SMichael Ellerman * Handle a machine check. 476d9953105SMichael Ellerman * 477d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 478d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 479d9953105SMichael Ellerman * error was recovered (never true if RI=0). 480d9953105SMichael Ellerman * 481d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 482d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 483d9953105SMichael Ellerman */ 484d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 485d9953105SMichael Ellerman { 486d9953105SMichael Ellerman struct rtas_error_log *errp; 487d9953105SMichael Ellerman 488d9953105SMichael Ellerman if (fwnmi_active) { 489d9953105SMichael Ellerman errp = fwnmi_get_errinfo(regs); 490d9953105SMichael Ellerman fwnmi_release_errinfo(); 491d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 492d9953105SMichael Ellerman return 1; 493d9953105SMichael Ellerman } 494d9953105SMichael Ellerman 495d9953105SMichael Ellerman return 0; 496d9953105SMichael Ellerman } 497