1d9953105SMichael Ellerman /* 2d9953105SMichael Ellerman * Copyright (C) 2001 Dave Engebretsen IBM Corporation 3d9953105SMichael Ellerman * 4d9953105SMichael Ellerman * This program is free software; you can redistribute it and/or modify 5d9953105SMichael Ellerman * it under the terms of the GNU General Public License as published by 6d9953105SMichael Ellerman * the Free Software Foundation; either version 2 of the License, or 7d9953105SMichael Ellerman * (at your option) any later version. 8d9953105SMichael Ellerman * 9d9953105SMichael Ellerman * This program is distributed in the hope that it will be useful, 10d9953105SMichael Ellerman * but WITHOUT ANY WARRANTY; without even the implied warranty of 11d9953105SMichael Ellerman * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12d9953105SMichael Ellerman * GNU General Public License for more details. 13d9953105SMichael Ellerman * 14d9953105SMichael Ellerman * You should have received a copy of the GNU General Public License 15d9953105SMichael Ellerman * along with this program; if not, write to the Free Software 16d9953105SMichael Ellerman * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17d9953105SMichael Ellerman */ 18d9953105SMichael Ellerman 19d9953105SMichael Ellerman #include <linux/sched.h> 20d9953105SMichael Ellerman #include <linux/interrupt.h> 21d9953105SMichael Ellerman #include <linux/irq.h> 2290128997SAnton Blanchard #include <linux/of.h> 2355fc0c56SAnton Blanchard #include <linux/fs.h> 2455fc0c56SAnton Blanchard #include <linux/reboot.h> 2594675cceSMahesh Salgaonkar #include <linux/irq_work.h> 26d9953105SMichael Ellerman 27d9953105SMichael Ellerman #include <asm/machdep.h> 28d9953105SMichael Ellerman #include <asm/rtas.h> 298c4f1f29SMichael Ellerman #include <asm/firmware.h> 30d9953105SMichael Ellerman 31577830b0SMichael Ellerman #include "pseries.h" 32c902be71SArnd Bergmann 33d9953105SMichael Ellerman static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 34d9953105SMichael Ellerman static DEFINE_SPINLOCK(ras_log_buf_lock); 35d9953105SMichael Ellerman 36d9953105SMichael Ellerman static int ras_check_exception_token; 37d9953105SMichael Ellerman 3894675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work); 3994675cceSMahesh Salgaonkar static struct irq_work mce_errlog_process_work = { 4094675cceSMahesh Salgaonkar .func = mce_process_errlog_event, 4194675cceSMahesh Salgaonkar }; 4294675cceSMahesh Salgaonkar 43d9953105SMichael Ellerman #define EPOW_SENSOR_TOKEN 9 44d9953105SMichael Ellerman #define EPOW_SENSOR_INDEX 0 45d9953105SMichael Ellerman 46b4af279aSVipin K Parashar /* EPOW events counter variable */ 47b4af279aSVipin K Parashar static int num_epow_events; 48b4af279aSVipin K Parashar 49b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id); 507d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); 517d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id); 52d9953105SMichael Ellerman 530ebfff14SBenjamin Herrenschmidt 54d9953105SMichael Ellerman /* 55c9dccf1dSSam Bobroff * Enable the hotplug interrupt late because processing them may touch other 56c9dccf1dSSam Bobroff * devices or systems (e.g. hugepages) that have not been initialized at the 57c9dccf1dSSam Bobroff * subsys stage. 58c9dccf1dSSam Bobroff */ 59c9dccf1dSSam Bobroff int __init init_ras_hotplug_IRQ(void) 60c9dccf1dSSam Bobroff { 61c9dccf1dSSam Bobroff struct device_node *np; 62c9dccf1dSSam Bobroff 63c9dccf1dSSam Bobroff /* Hotplug Events */ 64c9dccf1dSSam Bobroff np = of_find_node_by_path("/event-sources/hot-plug-events"); 65c9dccf1dSSam Bobroff if (np != NULL) { 66c9dccf1dSSam Bobroff if (dlpar_workqueue_init() == 0) 67c9dccf1dSSam Bobroff request_event_sources_irqs(np, ras_hotplug_interrupt, 68c9dccf1dSSam Bobroff "RAS_HOTPLUG"); 69c9dccf1dSSam Bobroff of_node_put(np); 70c9dccf1dSSam Bobroff } 71c9dccf1dSSam Bobroff 72c9dccf1dSSam Bobroff return 0; 73c9dccf1dSSam Bobroff } 74c9dccf1dSSam Bobroff machine_late_initcall(pseries, init_ras_hotplug_IRQ); 75c9dccf1dSSam Bobroff 76c9dccf1dSSam Bobroff /* 77d9953105SMichael Ellerman * Initialize handlers for the set of interrupts caused by hardware errors 78d9953105SMichael Ellerman * and power system events. 79d9953105SMichael Ellerman */ 80d9953105SMichael Ellerman static int __init init_ras_IRQ(void) 81d9953105SMichael Ellerman { 82d9953105SMichael Ellerman struct device_node *np; 83d9953105SMichael Ellerman 84d9953105SMichael Ellerman ras_check_exception_token = rtas_token("check-exception"); 85d9953105SMichael Ellerman 86d9953105SMichael Ellerman /* Internal Errors */ 87d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/internal-errors"); 88d9953105SMichael Ellerman if (np != NULL) { 8932c96f77SMark Nelson request_event_sources_irqs(np, ras_error_interrupt, 9032c96f77SMark Nelson "RAS_ERROR"); 91d9953105SMichael Ellerman of_node_put(np); 92d9953105SMichael Ellerman } 93d9953105SMichael Ellerman 94d9953105SMichael Ellerman /* EPOW Events */ 95d9953105SMichael Ellerman np = of_find_node_by_path("/event-sources/epow-events"); 96d9953105SMichael Ellerman if (np != NULL) { 9732c96f77SMark Nelson request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); 98d9953105SMichael Ellerman of_node_put(np); 99d9953105SMichael Ellerman } 100d9953105SMichael Ellerman 10169ed3324SAnton Blanchard return 0; 102d9953105SMichael Ellerman } 1038e83e905SMichael Ellerman machine_subsys_initcall(pseries, init_ras_IRQ); 104d9953105SMichael Ellerman 10555fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_NORMAL 1 10655fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_ON_UPS 2 10755fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 10855fc0c56SAnton Blanchard #define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 10955fc0c56SAnton Blanchard 11055fc0c56SAnton Blanchard static void handle_system_shutdown(char event_modifier) 11155fc0c56SAnton Blanchard { 11255fc0c56SAnton Blanchard switch (event_modifier) { 11355fc0c56SAnton Blanchard case EPOW_SHUTDOWN_NORMAL: 114b4af279aSVipin K Parashar pr_emerg("Power off requested\n"); 1151b7e0cbeSliguang orderly_poweroff(true); 11655fc0c56SAnton Blanchard break; 11755fc0c56SAnton Blanchard 11855fc0c56SAnton Blanchard case EPOW_SHUTDOWN_ON_UPS: 119b4af279aSVipin K Parashar pr_emerg("Loss of system power detected. System is running on" 120b4af279aSVipin K Parashar " UPS/battery. Check RTAS error log for details\n"); 12179872e35SAnshuman Khandual orderly_poweroff(true); 12255fc0c56SAnton Blanchard break; 12355fc0c56SAnton Blanchard 12455fc0c56SAnton Blanchard case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: 125b4af279aSVipin K Parashar pr_emerg("Loss of system critical functions detected. Check" 126b4af279aSVipin K Parashar " RTAS error log for details\n"); 1271b7e0cbeSliguang orderly_poweroff(true); 12855fc0c56SAnton Blanchard break; 12955fc0c56SAnton Blanchard 13055fc0c56SAnton Blanchard case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: 131b4af279aSVipin K Parashar pr_emerg("High ambient temperature detected. Check RTAS" 132b4af279aSVipin K Parashar " error log for details\n"); 1331b7e0cbeSliguang orderly_poweroff(true); 13455fc0c56SAnton Blanchard break; 13555fc0c56SAnton Blanchard 13655fc0c56SAnton Blanchard default: 137b4af279aSVipin K Parashar pr_err("Unknown power/cooling shutdown event (modifier = %d)\n", 13855fc0c56SAnton Blanchard event_modifier); 13955fc0c56SAnton Blanchard } 14055fc0c56SAnton Blanchard } 14155fc0c56SAnton Blanchard 14255fc0c56SAnton Blanchard struct epow_errorlog { 14355fc0c56SAnton Blanchard unsigned char sensor_value; 14455fc0c56SAnton Blanchard unsigned char event_modifier; 14555fc0c56SAnton Blanchard unsigned char extended_modifier; 14655fc0c56SAnton Blanchard unsigned char reserved; 14755fc0c56SAnton Blanchard unsigned char platform_reason; 14855fc0c56SAnton Blanchard }; 14955fc0c56SAnton Blanchard 15055fc0c56SAnton Blanchard #define EPOW_RESET 0 15155fc0c56SAnton Blanchard #define EPOW_WARN_COOLING 1 15255fc0c56SAnton Blanchard #define EPOW_WARN_POWER 2 15355fc0c56SAnton Blanchard #define EPOW_SYSTEM_SHUTDOWN 3 15455fc0c56SAnton Blanchard #define EPOW_SYSTEM_HALT 4 15555fc0c56SAnton Blanchard #define EPOW_MAIN_ENCLOSURE 5 15655fc0c56SAnton Blanchard #define EPOW_POWER_OFF 7 15755fc0c56SAnton Blanchard 158e51df2c1SAnton Blanchard static void rtas_parse_epow_errlog(struct rtas_error_log *log) 15955fc0c56SAnton Blanchard { 16055fc0c56SAnton Blanchard struct pseries_errorlog *pseries_log; 16155fc0c56SAnton Blanchard struct epow_errorlog *epow_log; 16255fc0c56SAnton Blanchard char action_code; 16355fc0c56SAnton Blanchard char modifier; 16455fc0c56SAnton Blanchard 16555fc0c56SAnton Blanchard pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); 16655fc0c56SAnton Blanchard if (pseries_log == NULL) 16755fc0c56SAnton Blanchard return; 16855fc0c56SAnton Blanchard 16955fc0c56SAnton Blanchard epow_log = (struct epow_errorlog *)pseries_log->data; 17055fc0c56SAnton Blanchard action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ 17155fc0c56SAnton Blanchard modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ 17255fc0c56SAnton Blanchard 17355fc0c56SAnton Blanchard switch (action_code) { 17455fc0c56SAnton Blanchard case EPOW_RESET: 175b4af279aSVipin K Parashar if (num_epow_events) { 176b4af279aSVipin K Parashar pr_info("Non critical power/cooling issue cleared\n"); 177b4af279aSVipin K Parashar num_epow_events--; 178b4af279aSVipin K Parashar } 17955fc0c56SAnton Blanchard break; 18055fc0c56SAnton Blanchard 18155fc0c56SAnton Blanchard case EPOW_WARN_COOLING: 182b4af279aSVipin K Parashar pr_info("Non-critical cooling issue detected. Check RTAS error" 183b4af279aSVipin K Parashar " log for details\n"); 18455fc0c56SAnton Blanchard break; 18555fc0c56SAnton Blanchard 18655fc0c56SAnton Blanchard case EPOW_WARN_POWER: 187b4af279aSVipin K Parashar pr_info("Non-critical power issue detected. Check RTAS error" 188b4af279aSVipin K Parashar " log for details\n"); 18955fc0c56SAnton Blanchard break; 19055fc0c56SAnton Blanchard 19155fc0c56SAnton Blanchard case EPOW_SYSTEM_SHUTDOWN: 19255fc0c56SAnton Blanchard handle_system_shutdown(epow_log->event_modifier); 19355fc0c56SAnton Blanchard break; 19455fc0c56SAnton Blanchard 19555fc0c56SAnton Blanchard case EPOW_SYSTEM_HALT: 196b4af279aSVipin K Parashar pr_emerg("Critical power/cooling issue detected. Check RTAS" 197b4af279aSVipin K Parashar " error log for details. Powering off.\n"); 1981b7e0cbeSliguang orderly_poweroff(true); 19955fc0c56SAnton Blanchard break; 20055fc0c56SAnton Blanchard 20155fc0c56SAnton Blanchard case EPOW_MAIN_ENCLOSURE: 20255fc0c56SAnton Blanchard case EPOW_POWER_OFF: 203b4af279aSVipin K Parashar pr_emerg("System about to lose power. Check RTAS error log " 204b4af279aSVipin K Parashar " for details. Powering off immediately.\n"); 20555fc0c56SAnton Blanchard emergency_sync(); 20655fc0c56SAnton Blanchard kernel_power_off(); 20755fc0c56SAnton Blanchard break; 20855fc0c56SAnton Blanchard 20955fc0c56SAnton Blanchard default: 210b4af279aSVipin K Parashar pr_err("Unknown power/cooling event (action code = %d)\n", 21155fc0c56SAnton Blanchard action_code); 21255fc0c56SAnton Blanchard } 213b4af279aSVipin K Parashar 214b4af279aSVipin K Parashar /* Increment epow events counter variable */ 215b4af279aSVipin K Parashar if (action_code != EPOW_RESET) 216b4af279aSVipin K Parashar num_epow_events++; 21755fc0c56SAnton Blanchard } 21855fc0c56SAnton Blanchard 219b7d9eb39SJohn Allen static irqreturn_t ras_hotplug_interrupt(int irq, void *dev_id) 220b7d9eb39SJohn Allen { 221b7d9eb39SJohn Allen struct pseries_errorlog *pseries_log; 222b7d9eb39SJohn Allen struct pseries_hp_errorlog *hp_elog; 223b7d9eb39SJohn Allen 224b7d9eb39SJohn Allen spin_lock(&ras_log_buf_lock); 225b7d9eb39SJohn Allen 226b7d9eb39SJohn Allen rtas_call(ras_check_exception_token, 6, 1, NULL, 227b7d9eb39SJohn Allen RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), 228b7d9eb39SJohn Allen RTAS_HOTPLUG_EVENTS, 0, __pa(&ras_log_buf), 229b7d9eb39SJohn Allen rtas_get_error_log_max()); 230b7d9eb39SJohn Allen 231b7d9eb39SJohn Allen pseries_log = get_pseries_errorlog((struct rtas_error_log *)ras_log_buf, 232b7d9eb39SJohn Allen PSERIES_ELOG_SECT_ID_HOTPLUG); 233b7d9eb39SJohn Allen hp_elog = (struct pseries_hp_errorlog *)pseries_log->data; 234b7d9eb39SJohn Allen 235b7d9eb39SJohn Allen /* 236b7d9eb39SJohn Allen * Since PCI hotplug is not currently supported on pseries, put PCI 237b7d9eb39SJohn Allen * hotplug events on the ras_log_buf to be handled by rtas_errd. 238b7d9eb39SJohn Allen */ 239b7d9eb39SJohn Allen if (hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_MEM || 240b7d9eb39SJohn Allen hp_elog->resource == PSERIES_HP_ELOG_RESOURCE_CPU) 241b7d9eb39SJohn Allen queue_hotplug_event(hp_elog, NULL, NULL); 242b7d9eb39SJohn Allen else 243b7d9eb39SJohn Allen log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 244b7d9eb39SJohn Allen 245b7d9eb39SJohn Allen spin_unlock(&ras_log_buf_lock); 246b7d9eb39SJohn Allen return IRQ_HANDLED; 247b7d9eb39SJohn Allen } 248b7d9eb39SJohn Allen 24955fc0c56SAnton Blanchard /* Handle environmental and power warning (EPOW) interrupts. */ 2507d12e780SDavid Howells static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) 251d9953105SMichael Ellerman { 25255fc0c56SAnton Blanchard int status; 25355fc0c56SAnton Blanchard int state; 254d9953105SMichael Ellerman int critical; 255d9953105SMichael Ellerman 2561c2cb594SThomas Huth status = rtas_get_sensor_fast(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, 2571c2cb594SThomas Huth &state); 258d9953105SMichael Ellerman 259d9953105SMichael Ellerman if (state > 3) 260d9953105SMichael Ellerman critical = 1; /* Time Critical */ 261d9953105SMichael Ellerman else 262d9953105SMichael Ellerman critical = 0; 263d9953105SMichael Ellerman 264d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 265d9953105SMichael Ellerman 266d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 267b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 268476eb491SGrant Likely virq_to_hw(irq), 2696f43747fSAnton Blanchard RTAS_EPOW_WARNING, 270d9953105SMichael Ellerman critical, __pa(&ras_log_buf), 271d9953105SMichael Ellerman rtas_get_error_log_max()); 272d9953105SMichael Ellerman 273d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); 274d9953105SMichael Ellerman 27555fc0c56SAnton Blanchard rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); 27655fc0c56SAnton Blanchard 277d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 278d9953105SMichael Ellerman return IRQ_HANDLED; 279d9953105SMichael Ellerman } 280d9953105SMichael Ellerman 281d9953105SMichael Ellerman /* 282d9953105SMichael Ellerman * Handle hardware error interrupts. 283d9953105SMichael Ellerman * 284d9953105SMichael Ellerman * RTAS check-exception is called to collect data on the exception. If 285d9953105SMichael Ellerman * the error is deemed recoverable, we log a warning and return. 286d9953105SMichael Ellerman * For nonrecoverable errors, an error is logged and we stop all processing 287d9953105SMichael Ellerman * as quickly as possible in order to prevent propagation of the failure. 288d9953105SMichael Ellerman */ 2897d12e780SDavid Howells static irqreturn_t ras_error_interrupt(int irq, void *dev_id) 290d9953105SMichael Ellerman { 291d9953105SMichael Ellerman struct rtas_error_log *rtas_elog; 292cc8b5263SAnton Blanchard int status; 293d9953105SMichael Ellerman int fatal; 294d9953105SMichael Ellerman 295d9953105SMichael Ellerman spin_lock(&ras_log_buf_lock); 296d9953105SMichael Ellerman 297d9953105SMichael Ellerman status = rtas_call(ras_check_exception_token, 6, 1, NULL, 298b08e281bSMark Nelson RTAS_VECTOR_EXTERNAL_INTERRUPT, 299476eb491SGrant Likely virq_to_hw(irq), 300d9953105SMichael Ellerman RTAS_INTERNAL_ERROR, 1 /* Time Critical */, 301d9953105SMichael Ellerman __pa(&ras_log_buf), 302d9953105SMichael Ellerman rtas_get_error_log_max()); 303d9953105SMichael Ellerman 304d9953105SMichael Ellerman rtas_elog = (struct rtas_error_log *)ras_log_buf; 305d9953105SMichael Ellerman 306a08a53eaSGreg Kurz if (status == 0 && 307a08a53eaSGreg Kurz rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) 308d9953105SMichael Ellerman fatal = 1; 309d9953105SMichael Ellerman else 310d9953105SMichael Ellerman fatal = 0; 311d9953105SMichael Ellerman 312d9953105SMichael Ellerman /* format and print the extended information */ 313d9953105SMichael Ellerman log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); 314d9953105SMichael Ellerman 315d9953105SMichael Ellerman if (fatal) { 316b4af279aSVipin K Parashar pr_emerg("Fatal hardware error detected. Check RTAS error" 317b4af279aSVipin K Parashar " log for details. Powering off immediately\n"); 318cc8b5263SAnton Blanchard emergency_sync(); 319cc8b5263SAnton Blanchard kernel_power_off(); 320d9953105SMichael Ellerman } else { 321b4af279aSVipin K Parashar pr_err("Recoverable hardware error detected\n"); 322d9953105SMichael Ellerman } 323d9953105SMichael Ellerman 324d9953105SMichael Ellerman spin_unlock(&ras_log_buf_lock); 325d9953105SMichael Ellerman return IRQ_HANDLED; 326d9953105SMichael Ellerman } 327d9953105SMichael Ellerman 328d368514cSAnton Blanchard /* 329d368514cSAnton Blanchard * Some versions of FWNMI place the buffer inside the 4kB page starting at 330d368514cSAnton Blanchard * 0x7000. Other versions place it inside the rtas buffer. We check both. 331d368514cSAnton Blanchard */ 332d368514cSAnton Blanchard #define VALID_FWNMI_BUFFER(A) \ 333d368514cSAnton Blanchard ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 334d368514cSAnton Blanchard (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 335d368514cSAnton Blanchard 33694675cceSMahesh Salgaonkar static inline struct rtas_error_log *fwnmi_get_errlog(void) 33794675cceSMahesh Salgaonkar { 33894675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 33994675cceSMahesh Salgaonkar } 34094675cceSMahesh Salgaonkar 341d368514cSAnton Blanchard /* 342d368514cSAnton Blanchard * Get the error information for errors coming through the 343d9953105SMichael Ellerman * FWNMI vectors. The pt_regs' r3 will be updated to reflect 344d9953105SMichael Ellerman * the actual r3 if possible, and a ptr to the error log entry 345d9953105SMichael Ellerman * will be returned if found. 346d9953105SMichael Ellerman * 34794675cceSMahesh Salgaonkar * Use one buffer mce_data_buf per cpu to store RTAS error. 348d368514cSAnton Blanchard * 34994675cceSMahesh Salgaonkar * The mce_data_buf does not have any locks or protection around it, 350d9953105SMichael Ellerman * if a second machine check comes in, or a system reset is done 351d9953105SMichael Ellerman * before we have logged the error, then we will get corruption in the 352d9953105SMichael Ellerman * error log. This is preferable over holding off on calling 353d9953105SMichael Ellerman * ibm,nmi-interlock which would result in us checkstopping if a 354d9953105SMichael Ellerman * second machine check did come in. 355d9953105SMichael Ellerman */ 356d9953105SMichael Ellerman static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 357d9953105SMichael Ellerman { 358d9953105SMichael Ellerman unsigned long *savep; 35994675cceSMahesh Salgaonkar struct rtas_error_log *h; 360d9953105SMichael Ellerman 361ee1dd1e3SMahesh Salgaonkar /* Mask top two bits */ 362ee1dd1e3SMahesh Salgaonkar regs->gpr[3] &= ~(0x3UL << 62); 363ee1dd1e3SMahesh Salgaonkar 364d368514cSAnton Blanchard if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { 365f0e939aeSAnton Blanchard printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); 366d368514cSAnton Blanchard return NULL; 367d9953105SMichael Ellerman } 368d368514cSAnton Blanchard 369d368514cSAnton Blanchard savep = __va(regs->gpr[3]); 370*cd813e1cSMahesh Salgaonkar regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */ 371d368514cSAnton Blanchard 372d368514cSAnton Blanchard h = (struct rtas_error_log *)&savep[1]; 37394675cceSMahesh Salgaonkar /* Use the per cpu buffer from paca to store rtas error log */ 37494675cceSMahesh Salgaonkar memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 375a08a53eaSGreg Kurz if (!rtas_error_extended(h)) { 37694675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 377d368514cSAnton Blanchard } else { 378a08a53eaSGreg Kurz int len, error_log_length; 379d368514cSAnton Blanchard 380a08a53eaSGreg Kurz error_log_length = 8 + rtas_error_extended_log_length(h); 38174e96bf4SMahesh Salgaonkar len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 38294675cceSMahesh Salgaonkar memcpy(local_paca->mce_data_buf, h, len); 383d368514cSAnton Blanchard } 384d368514cSAnton Blanchard 38594675cceSMahesh Salgaonkar return (struct rtas_error_log *)local_paca->mce_data_buf; 386d9953105SMichael Ellerman } 387d9953105SMichael Ellerman 388d9953105SMichael Ellerman /* Call this when done with the data returned by FWNMI_get_errinfo. 389d9953105SMichael Ellerman * It will release the saved data area for other CPUs in the 390d9953105SMichael Ellerman * partition to receive FWNMI errors. 391d9953105SMichael Ellerman */ 392d9953105SMichael Ellerman static void fwnmi_release_errinfo(void) 393d9953105SMichael Ellerman { 394d9953105SMichael Ellerman int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); 395d9953105SMichael Ellerman if (ret != 0) 396d368514cSAnton Blanchard printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); 397d9953105SMichael Ellerman } 398d9953105SMichael Ellerman 399c902be71SArnd Bergmann int pSeries_system_reset_exception(struct pt_regs *regs) 400d9953105SMichael Ellerman { 401bded0706SNicholas Piggin #ifdef __LITTLE_ENDIAN__ 402bded0706SNicholas Piggin /* 403bded0706SNicholas Piggin * Some firmware byteswaps SRR registers and gives incorrect SRR1. Try 404bded0706SNicholas Piggin * to detect the bad SRR1 pattern here. Flip the NIP back to correct 405bded0706SNicholas Piggin * endian for reporting purposes. Unfortunately the MSR can't be fixed, 406bded0706SNicholas Piggin * so clear it. It will be missing MSR_RI so we won't try to recover. 407bded0706SNicholas Piggin */ 408bded0706SNicholas Piggin if ((be64_to_cpu(regs->msr) & 409bded0706SNicholas Piggin (MSR_LE|MSR_RI|MSR_DR|MSR_IR|MSR_ME|MSR_PR| 410bded0706SNicholas Piggin MSR_ILE|MSR_HV|MSR_SF)) == (MSR_DR|MSR_SF)) { 411bded0706SNicholas Piggin regs->nip = be64_to_cpu((__be64)regs->nip); 412bded0706SNicholas Piggin regs->msr = 0; 413bded0706SNicholas Piggin } 414bded0706SNicholas Piggin #endif 415bded0706SNicholas Piggin 416d9953105SMichael Ellerman if (fwnmi_active) { 417d9953105SMichael Ellerman struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); 418d9953105SMichael Ellerman if (errhdr) { 419d9953105SMichael Ellerman /* XXX Should look at FWNMI information */ 420d9953105SMichael Ellerman } 421d9953105SMichael Ellerman fwnmi_release_errinfo(); 422d9953105SMichael Ellerman } 423102c05e8SNicholas Piggin 424102c05e8SNicholas Piggin if (smp_handle_nmi_ipi(regs)) 425102c05e8SNicholas Piggin return 1; 426102c05e8SNicholas Piggin 427c902be71SArnd Bergmann return 0; /* need to perform reset */ 428d9953105SMichael Ellerman } 429d9953105SMichael Ellerman 430d9953105SMichael Ellerman /* 43194675cceSMahesh Salgaonkar * Process MCE rtas errlog event. 43294675cceSMahesh Salgaonkar */ 43394675cceSMahesh Salgaonkar static void mce_process_errlog_event(struct irq_work *work) 43494675cceSMahesh Salgaonkar { 43594675cceSMahesh Salgaonkar struct rtas_error_log *err; 43694675cceSMahesh Salgaonkar 43794675cceSMahesh Salgaonkar err = fwnmi_get_errlog(); 43894675cceSMahesh Salgaonkar log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 43994675cceSMahesh Salgaonkar } 44094675cceSMahesh Salgaonkar 44194675cceSMahesh Salgaonkar /* 442d9953105SMichael Ellerman * See if we can recover from a machine check exception. 443d9953105SMichael Ellerman * This is only called on power4 (or above) and only via 444d9953105SMichael Ellerman * the Firmware Non-Maskable Interrupts (fwnmi) handler 445d9953105SMichael Ellerman * which provides the error analysis for us. 446d9953105SMichael Ellerman * 447d9953105SMichael Ellerman * Return 1 if corrected (or delivered a signal). 448d9953105SMichael Ellerman * Return 0 if there is nothing we can do. 449d9953105SMichael Ellerman */ 450d9953105SMichael Ellerman static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) 451d9953105SMichael Ellerman { 452d47d1d8aSAnton Blanchard int recovered = 0; 453a08a53eaSGreg Kurz int disposition = rtas_error_disposition(err); 454d9953105SMichael Ellerman 455d47d1d8aSAnton Blanchard if (!(regs->msr & MSR_RI)) { 456d47d1d8aSAnton Blanchard /* If MSR_RI isn't set, we cannot recover */ 457d47d1d8aSAnton Blanchard recovered = 0; 458d47d1d8aSAnton Blanchard 459a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { 460d9953105SMichael Ellerman /* Platform corrected itself */ 461d47d1d8aSAnton Blanchard recovered = 1; 462d47d1d8aSAnton Blanchard 463a08a53eaSGreg Kurz } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { 464d47d1d8aSAnton Blanchard /* Platform corrected itself but could be degraded */ 465d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: limited recovery, system may " 466d47d1d8aSAnton Blanchard "be degraded\n"); 467d47d1d8aSAnton Blanchard recovered = 1; 468d47d1d8aSAnton Blanchard 469d47d1d8aSAnton Blanchard } else if (user_mode(regs) && !is_global_init(current) && 470a08a53eaSGreg Kurz rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { 471d47d1d8aSAnton Blanchard 472d47d1d8aSAnton Blanchard /* 473d47d1d8aSAnton Blanchard * If we received a synchronous error when in userspace 474d47d1d8aSAnton Blanchard * kill the task. Firmware may report details of the fail 475d47d1d8aSAnton Blanchard * asynchronously, so we can't rely on the target and type 476d47d1d8aSAnton Blanchard * fields being valid here. 477d47d1d8aSAnton Blanchard */ 478d47d1d8aSAnton Blanchard printk(KERN_ERR "MCE: uncorrectable error, killing task " 479d47d1d8aSAnton Blanchard "%s:%d\n", current->comm, current->pid); 480d47d1d8aSAnton Blanchard 481d47d1d8aSAnton Blanchard _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); 482d47d1d8aSAnton Blanchard recovered = 1; 483d9953105SMichael Ellerman } 484d9953105SMichael Ellerman 48594675cceSMahesh Salgaonkar /* Queue irq work to log this rtas event later. */ 48694675cceSMahesh Salgaonkar irq_work_queue(&mce_errlog_process_work); 487d9953105SMichael Ellerman 488d47d1d8aSAnton Blanchard return recovered; 489d9953105SMichael Ellerman } 490d9953105SMichael Ellerman 491d9953105SMichael Ellerman /* 492d9953105SMichael Ellerman * Handle a machine check. 493d9953105SMichael Ellerman * 494d9953105SMichael Ellerman * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) 495d9953105SMichael Ellerman * should be present. If so the handler which called us tells us if the 496d9953105SMichael Ellerman * error was recovered (never true if RI=0). 497d9953105SMichael Ellerman * 498d9953105SMichael Ellerman * On hardware prior to Power 4 these exceptions were asynchronous which 499d9953105SMichael Ellerman * means we can't tell exactly where it occurred and so we can't recover. 500d9953105SMichael Ellerman */ 501d9953105SMichael Ellerman int pSeries_machine_check_exception(struct pt_regs *regs) 502d9953105SMichael Ellerman { 503d9953105SMichael Ellerman struct rtas_error_log *errp; 504d9953105SMichael Ellerman 505d9953105SMichael Ellerman if (fwnmi_active) { 506d9953105SMichael Ellerman errp = fwnmi_get_errinfo(regs); 507d9953105SMichael Ellerman fwnmi_release_errinfo(); 508d9953105SMichael Ellerman if (errp && recover_mce(regs, errp)) 509d9953105SMichael Ellerman return 1; 510d9953105SMichael Ellerman } 511d9953105SMichael Ellerman 512d9953105SMichael Ellerman return 0; 513d9953105SMichael Ellerman } 514